Beispiel #1
0
def run(EvaluatorClass,
        inputCorpusFile,
        goldCorpusFile,
        parse,
        tokenization=None,
        target="both",
        entityMatchFunction=compareEntitiesSimple,
        removeIntersentenceInteractions=False,
        errorMatrix=False,
        verbose=False):
    """
    Load a predicted corpus (and optionally a gold corpus) and compare them.

    EvaluatorClass.type selects the class-id mapping handed to the
    evaluator: "binary" uses locked True/False id sets with -1 as the
    negative class id, "multiclass" uses unlocked id sets seeded with
    "neg" and 1 as the negative class id. Any other type ends the program
    through sys.exit.

    The gold corpus is only loaded when goldCorpusFile differs from None.
    Both corpora are loaded with the same parse, tokenization and
    removeIntersentenceInteractions arguments. The return value is
    whatever processCorpora returns.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    if EvaluatorClass.type == "binary":
        negativeClassId = -1
        for elementName in ("entity", "interaction"):
            classSets[elementName] = IdSet(idDict={"True": 1, "False": -1},
                                           locked=True)
    elif EvaluatorClass.type == "multiclass":
        negativeClassId = 1
        for elementName in ("entity", "interaction"):
            classSets[elementName] = IdSet(idDict={"neg": 1}, locked=False)
    else:
        sys.exit("Unknown evaluator type")

    # Load corpus and make sentence graphs
    def loadElements(corpusFile):
        return SentenceGraph.loadCorpus(corpusFile, parse, tokenization,
                                        False,
                                        removeIntersentenceInteractions)

    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = loadElements(goldCorpusFile)
    predictedCorpusElements = loadElements(inputCorpusFile)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass,
                          predictedCorpusElements,
                          goldCorpusElements,
                          target,
                          classSets,
                          negativeClassId,
                          entityMatchFunction,
                          errorMatrix=errorMatrix,
                          verbose=verbose)
Beispiel #2
0
 def build(self, corpus, parse, tokenization=None):
     """
     Accumulate interaction-type counts for every POS-tag pair in a corpus.

     corpus may be a path string or an ElementTree, in which case it is
     loaded with SentenceGraph.loadCorpus using parse and tokenization;
     any other object is used as-is. For every ordered pair of tokens in
     each sentence that has a sentence graph (a token is also paired with
     itself), the counter stored under the pair's (POS, POS) tuple is
     incremented once per interaction edge under that edge's type, or
     under "neg" when the interaction graph has no edge for the pair.
     self.update() is called once after all sentences are processed.
     """
     assert corpus != None
     needsLoading = type(corpus) == types.StringType or isinstance(corpus, ET.ElementTree)
     if needsLoading: # corpus is in file
         corpus = SentenceGraph.loadCorpus(corpus, parse, tokenization)

     for sentence in corpus.sentences:
         graph = sentence.sentenceGraph
         if graph == None:
             continue
         for firstToken in graph.tokens:
             for secondToken in graph.tokens:
                 posPair = (firstToken.get("POS"), secondToken.get("POS"))
                 # Every POS pair gets an entry, created on first sight
                 pairCounts = self.counts.setdefault(posPair, {})
                 if not graph.interactionGraph.has_edge(firstToken, secondToken):
                     pairCounts["neg"] = pairCounts.get("neg", 0) + 1
                     continue
                 edges = graph.interactionGraph.get_edge_data(firstToken, secondToken, default={})
                 # Edge data is indexed by consecutive integers starting at 0
                 for edgeIndex in range(len(edges)):
                     edgeType = edges[edgeIndex]["element"].get("type")
                     pairCounts[edgeType] = pairCounts.get(edgeType, 0) + 1
     self.update()
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Run the intersentence edge example builder without the caller
        having to instantiate it.

        The class and feature id sets come from cls.getIdSets(idFileTag).
        style is only passed to the builder when it differs from None.
        When input is not a list it is loaded with SentenceGraph.loadCorpus
        and examples are built from the corpus' documentSentences and
        written to output; nothing is returned in that case. When input is
        a list it is returned unchanged and no examples are built.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style != None:
            builderArguments["style"] = style
        e = IntersentenceEdgeExampleBuilder(**builderArguments)

        if type(input) == types.ListType:
            # assume input is already a list of sentences
            # NOTE(review): removeNameInfo is not defined anywhere in this
            # function; unless it exists at module level this assert raises
            # NameError. The builder created above is also unused on this
            # path. Confirm the intended behaviour for list input.
            assert (removeNameInfo == False)
            return input

        # Load corpus and make sentence graphs
        corpusElements = SentenceGraph.loadCorpus(input, parse, tokenization,
                                                  False, True)
        # run examplebuilder
        e.buildExamplesForDocuments(corpusElements.documentSentences, output,
                                    idFileTag)
Beispiel #4
0
    def build(self, corpus, parse, tokenization=None):
        """
        Count interaction types per part-of-speech pair over a corpus.

        A corpus given as a path string or an ElementTree is first loaded
        through SentenceGraph.loadCorpus; other objects are used directly.
        Sentences without a sentence graph are skipped. For each ordered
        token pair of a sentence (including a token paired with itself)
        the entry self.counts[(POS1, POS2)] is updated: one increment per
        interaction edge, keyed by the edge element's "type", or one
        increment of "neg" when the pair has no edge. Finishes by calling
        self.update().
        """
        assert corpus != None
        isFileLike = type(corpus) == types.StringType or isinstance(
            corpus, ET.ElementTree)
        if isFileLike:  # corpus is in file
            corpus = SentenceGraph.loadCorpus(corpus, parse, tokenization)

        for sentence in corpus.sentences:
            graph = sentence.sentenceGraph
            if graph == None:
                continue
            for tokenA in graph.tokens:
                for tokenB in graph.tokens:
                    key = (tokenA.get("POS"), tokenB.get("POS"))
                    if key not in self.counts:
                        self.counts[key] = {}
                    typeCounts = self.counts[key]

                    if graph.interactionGraph.has_edge(tokenA, tokenB):
                        edgeData = graph.interactionGraph.get_edge_data(
                            tokenA, tokenB, default={})
                        # Edge data is looked up by integer position
                        observedTypes = [
                            edgeData[n]["element"].get("type")
                            for n in range(len(edgeData))
                        ]
                    else:
                        observedTypes = ["neg"]

                    for typeName in observedTypes:
                        if typeName not in typeCounts:
                            typeCounts[typeName] = 0
                        typeCounts[typeName] += 1
        self.update()
Beispiel #5
0
def _removeHeadOffsets(entities):
    """Delete the "headOffset" attribute from every entity element that has one and return the number removed."""
    removed = 0
    for entity in entities:
        if entity.get("headOffset") is not None:
            del entity.attrib["headOffset"]
            removed += 1
    return removed


def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """
    Determine entity head offsets for a corpus with the given parse.

    input          -- corpus passed to ETUtils.ETFromObj (or, when iterate
                      is set, to SentenceElements.getCorpusIterator)
    parse          -- name of the parse to use
    tokenization   -- name of the tokenization to use (may be None)
    output         -- where the result is written; None skips writing in
                      the non-iterating mode
    removeExisting -- delete existing "headOffset" attributes first
    iterate        -- process the corpus through the corpus iterator
                      instead of loading it as a whole

    Returns the parsed XML object in the non-iterating mode. The iterating
    mode returns None.
    """
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        # Only announce the removal when it is actually requested, as the
        # non-iterating branch does.
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    removeCount += _removeHeadOffsets(sentence.sentence.findall("entity"))
                # Building the graph and mapping the interactions is done for its side effects on the sentence
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        if removeExisting:
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        # Parse the input exactly once; the tree is reused for the removal
        # pass, for loading the corpus and as the return value.
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    removeCount += _removeHeadOffsets(s.findall("entity"))
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph is None:
                continue
            if sentence.sentenceGraph.tokenHeadScores is None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output is not None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Beispiel #6
0
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False, errorMatrix=False, verbose=False):
    """
    Evaluate a predicted interaction XML corpus, optionally against a gold corpus.

    The id sets and the negative class id depend on EvaluatorClass.type
    ("binary" or "multiclass"); an unknown type ends the program through
    sys.exit. The gold corpus is loaded only when goldCorpusFile differs
    from None. Returns the result of processCorpora.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    if EvaluatorClass.type == "binary":
        idMapping, isLocked, negativeClassId = {"True":1,"False":-1}, True, -1
    elif EvaluatorClass.type == "multiclass":
        idMapping, isLocked, negativeClassId = {"neg":1}, False, 1
    else:
        sys.exit("Unknown evaluator type")
    # Each id set receives its own copy of the initial mapping
    classSets["entity"] = IdSet(idDict=dict(idMapping), locked=isLocked)
    classSets["interaction"] = IdSet(idDict=dict(idMapping), locked=isLocked)

    # Load corpus and make sentence graphs
    loaderArguments = (parse, tokenization, False, removeIntersentenceInteractions)
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(goldCorpusFile, *loaderArguments)
    predictedCorpusElements = SentenceGraph.loadCorpus(inputCorpusFile, *loaderArguments)

    # Compare the corpora and print results on screen
    return processCorpora(
        EvaluatorClass,
        predictedCorpusElements,
        goldCorpusElements,
        target,
        classSets,
        negativeClassId,
        entityMatchFunction,
        errorMatrix=errorMatrix,
        verbose=verbose)
        "The file to which the new XML structure is saved. If None, will be the same as target.",
        metavar="FILE")
    optparser.add_option("-t",
                         "--tokenization",
                         default=None,
                         dest="tokenization",
                         help="Tokenization element name")
    optparser.add_option("-p",
                         "--parse",
                         default=None,
                         dest="parse",
                         help="Parse element name")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Loading input file", options.input
    corpusElements = SentenceGraph.loadCorpus(options.input, options.parse,
                                              options.tokenization)

    counter = ProgressCounter(len(corpusElements.sentences),
                              "Resolving chains")
    tags = ["e1", "e2"]
    for sentence in corpusElements.sentences:
        counter.update(
            1,
            "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ")
        identityChainDict = {}
        tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores()
        for interaction in sentence.interactions:
            if interaction.attrib["type"] == "identity":
                e1 = sentence.entitiesById[interaction.attrib["e1"]]
                e2 = sentence.entitiesById[interaction.attrib["e2"]]
                t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1]
Beispiel #8
0
def findHeads(input,
              parse,
              tokenization=None,
              output=None,
              removeExisting=True,
              iterate=False):
    """
    Determine entity head offsets for a corpus with the given parse.

    input          -- corpus passed to ETUtils.ETFromObj (or, when iterate
                      is set, to SentenceElements.getCorpusIterator)
    parse          -- name of the parse to use
    tokenization   -- name of the tokenization to use (may be None)
    output         -- where the result is written; None skips writing in
                      the non-iterating mode
    removeExisting -- delete existing "headOffset" attributes first
    iterate        -- process the corpus through the corpus iterator
                      instead of loading it as a whole

    Returns the parsed XML object in the non-iterating mode. The iterating
    mode returns None.
    """
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        # Only announce the removal when it is actually requested, as the
        # non-iterating branch does.
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(
                input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") is not None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the graph and mapping the interactions is done
                # for its side effects on the sentence
                graph = SentenceGraph.SentenceGraph(sentence.sentence,
                                                    sentence.tokens,
                                                    sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(
                len(sentences),
                "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        if removeExisting:
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        # Parse the input exactly once; the tree is reused for the removal
        # pass, for loading the corpus and as the return value.
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") is not None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph is None:
                continue
            if sentence.sentenceGraph.tokenHeadScores is None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output is not None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Beispiel #9
0
def buildExamples(corpusDir, outPath):
    """
    Prepare the BioInfer corpora and generate trigger and edge example files.

    corpusDir -- directory containing the bioinfer.*.gold.gif.xml corpora
    outPath   -- directory that receives the XML files with head tokens

    Each step is skipped when its output file already exists. The
    gazetteer and example files are addressed by bare file names, i.e.
    relative to the current working directory rather than to outPath.
    """
    # define shortcuts for commonly used files
    PARSE = "stanford-newMC-intra"  #"split-Charniak-Lease"
    TOK = "split-McClosky"
    CORPUS_DIR = corpusDir
    corpusSuffix = ".refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"

    # xml files without heads
    BI_DEVEL_FILE = CORPUS_DIR + "/bioinfer.devel" + corpusSuffix
    BI_TEST_FILE = CORPUS_DIR + "/bioinfer.test" + corpusSuffix
    BI_TRAIN_FILE = CORPUS_DIR + "/bioinfer.train" + corpusSuffix
    BI_TRAIN_AND_DEVEL_FILE = CORPUS_DIR + "/bioinfer.train+devel" + corpusSuffix

    # xml files with head tokens
    # (built from the outPath parameter; the body previously read a
    # differently-cased name, "outpath", which is not a parameter)
    TEST_FILE = outPath + "/bioinfer-test-" + PARSE + ".xml"
    DEVEL_FILE = outPath + "/bioinfer-devel-" + PARSE + ".xml"
    TRAIN_FILE = outPath + "/bioinfer-train-" + PARSE + ".xml"
    TRAIN_AND_DEVEL_FILE = outPath + "/bioinfer-train-and-devel-" + PARSE + ".xml"

    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    headJobs = (
        (BI_TEST_FILE, TEST_FILE),
        (BI_DEVEL_FILE, DEVEL_FILE),
        (BI_TRAIN_FILE, TRAIN_FILE),
        (BI_TRAIN_AND_DEVEL_FILE, TRAIN_AND_DEVEL_FILE),
    )
    for sourceFile, headFile in headJobs:
        if not os.path.exists(headFile):
            c = SentenceGraph.loadCorpus(sourceFile, PARSE, TOK)
            ETUtils.write(c.rootElement, headFile)

    ###############################################################################
    # Trigger example generation
    ###############################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    GAZETTEER_TRAIN = "gazetteer-train-" + TOK
    GAZETTEER_TRAIN_AND_DEVEL = "gazetteer-train-and-devel-" + TOK
    gazetteerJobs = (
        (TRAIN_FILE, GAZETTEER_TRAIN),
        (TRAIN_AND_DEVEL_FILE, GAZETTEER_TRAIN_AND_DEVEL),
    )
    for corpusFile, gazetteerFile in gazetteerJobs:
        if not os.path.exists(gazetteerFile):
            Gazetteer.run(corpusFile, gazetteerFile, TOK)

    # generate the files for the old charniak
    # (corpus file, example file, gazetteer used for that example set)
    triggerJobs = (
        (TRAIN_FILE, "trigger-train-examples-" + PARSE, GAZETTEER_TRAIN),
        (DEVEL_FILE, "trigger-devel-examples-" + PARSE, GAZETTEER_TRAIN),
        (TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-" + PARSE,
         GAZETTEER_TRAIN_AND_DEVEL),
        (TEST_FILE, "trigger-test-examples-" + PARSE,
         GAZETTEER_TRAIN_AND_DEVEL),
    )
    for corpusFile, exampleFile, gazetteerFile in triggerJobs:
        if not os.path.exists(exampleFile):
            GeneralEntityTypeRecognizerGztr.run(corpusFile, exampleFile,
                                                PARSE, TOK, "style:typed",
                                                "bioinfer-trigger-ids",
                                                gazetteerFile)

    ###############################################################################
    # Edge example generation
    ###############################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

    # NOTE! The TEST examples will be based on gold standard triggers!
    edgeJobs = (
        (TRAIN_FILE, "edge-train-examples-" + PARSE),
        (DEVEL_FILE, "edge-devel-examples-" + PARSE),
        (TRAIN_AND_DEVEL_FILE, "edge-train-and-devel-examples-" + PARSE),
        (TEST_FILE, "edge-test-examples-" + PARSE),
    )
    for corpusFile, exampleFile in edgeJobs:
        if not os.path.exists(exampleFile):
            MultiEdgeExampleBuilder.run(corpusFile, exampleFile, PARSE, TOK,
                                        EDGE_FEATURE_PARAMS,
                                        "bioinfer-edge-ids")
Beispiel #10
0
# Output name of the edge test examples for the current parse
EDGE_TEST_EXAMPLE_FILE="edge-test-examples-"+PARSE
EDGE_CLASS_NAMES="bioinfer-edge-ids.class_names"

# Style string for edge example generation (not used in the statements below)
EDGE_FEATURE_PARAMS="style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

# NOTE(review): "if True:" looks like a manual on/off switch for this stage - confirm
if True:
    ###############################################################################
    # Head token detection
    ###############################################################################
    
    # Find heads
    # NOTE(review): the parent directory is presumably added so that the
    # imports below resolve - confirm against the project layout
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    
    # Load each BI_* corpus with the chosen parse and tokenization and write
    # the root element of the loaded corpus to the corresponding *_FILE
    ETUtils.write(SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement, TEST_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement, DEVEL_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement, TRAIN_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK).rootElement, TRAIN_AND_DEVEL_FILE)
    
    ###############################################################################
    # Trigger example generation
    ###############################################################################
    print >> sys.stderr, "Trigger examples for parse", PARSE
    # One gazetteer from the train file and one from the train+devel file
    Gazetteer.run(TRAIN_FILE, "gazetteer-train-"+TOK, TOK)
    Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-"+TOK, TOK)
    # Generate example files
    # The train and devel sets are given the train gazetteer; the
    # train+devel and test sets are given the train+devel gazetteer
    GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, TRIGGER_TRAIN_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, TRIGGER_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, TRIGGER_TRAIN_AND_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(TEST_FILE, TRIGGER_TEST_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
Beispiel #11
0
def buildExamples(corpusDir, outPath):
    """
    Prepare the BioInfer corpora and generate trigger and edge example files.

    corpusDir -- directory containing the bioinfer.*.gold.gif.xml corpora
    outPath   -- directory that receives the XML files with head tokens

    Each step is skipped when its output file already exists. The
    gazetteer and example files are addressed by bare file names, i.e.
    relative to the current working directory rather than to outPath.
    """
    # define shortcuts for commonly used files
    PARSE="stanford-newMC-intra" #"split-Charniak-Lease"
    TOK="split-McClosky"
    CORPUS_DIR=corpusDir
    corpusSuffix=".refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"

    # xml files without heads
    BI_DEVEL_FILE=CORPUS_DIR+"/bioinfer.devel"+corpusSuffix
    BI_TEST_FILE=CORPUS_DIR+"/bioinfer.test"+corpusSuffix
    BI_TRAIN_FILE=CORPUS_DIR+"/bioinfer.train"+corpusSuffix
    BI_TRAIN_AND_DEVEL_FILE=CORPUS_DIR+"/bioinfer.train+devel"+corpusSuffix

    # xml files with head tokens
    # (built from the outPath parameter; the body previously read a
    # differently-cased name, "outpath", which is not a parameter)
    TEST_FILE=outPath+"/bioinfer-test-"+PARSE+".xml"
    DEVEL_FILE=outPath+"/bioinfer-devel-"+PARSE+".xml"
    TRAIN_FILE=outPath+"/bioinfer-train-"+PARSE+".xml"
    TRAIN_AND_DEVEL_FILE=outPath+"/bioinfer-train-and-devel-"+PARSE+".xml"

    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    for sourceFile, headFile in [(BI_TEST_FILE, TEST_FILE),
                                 (BI_DEVEL_FILE, DEVEL_FILE),
                                 (BI_TRAIN_FILE, TRAIN_FILE),
                                 (BI_TRAIN_AND_DEVEL_FILE, TRAIN_AND_DEVEL_FILE)]:
        if not os.path.exists(headFile):
            c = SentenceGraph.loadCorpus(sourceFile, PARSE, TOK)
            ETUtils.write(c.rootElement, headFile)

    ###############################################################################
    # Trigger example generation
    ###############################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    trainGazetteer = "gazetteer-train-"+TOK
    trainAndDevelGazetteer = "gazetteer-train-and-devel-"+TOK
    for corpusFile, gazetteerFile in [(TRAIN_FILE, trainGazetteer),
                                      (TRAIN_AND_DEVEL_FILE, trainAndDevelGazetteer)]:
        if not os.path.exists(gazetteerFile):
            Gazetteer.run(corpusFile, gazetteerFile, TOK)

    # generate the files for the old charniak
    # (corpus file, example file, gazetteer used for that example set)
    for corpusFile, exampleFile, gazetteerFile in [
            (TRAIN_FILE, "trigger-train-examples-"+PARSE, trainGazetteer),
            (DEVEL_FILE, "trigger-devel-examples-"+PARSE, trainGazetteer),
            (TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-"+PARSE, trainAndDevelGazetteer),
            (TEST_FILE, "trigger-test-examples-"+PARSE, trainAndDevelGazetteer)]:
        if not os.path.exists(exampleFile):
            GeneralEntityTypeRecognizerGztr.run(corpusFile, exampleFile, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", gazetteerFile)

    ###############################################################################
    # Edge example generation
    ###############################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS="style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

    # NOTE! The TEST examples will be based on gold standard triggers!
    for corpusFile, exampleFile in [
            (TRAIN_FILE, "edge-train-examples-"+PARSE),
            (DEVEL_FILE, "edge-devel-examples-"+PARSE),
            (TRAIN_AND_DEVEL_FILE, "edge-train-and-devel-examples-"+PARSE),
            (TEST_FILE, "edge-test-examples-"+PARSE)]:
        if not os.path.exists(exampleFile):
            MultiEdgeExampleBuilder.run(corpusFile, exampleFile, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
Beispiel #12
0
 def loadCorpus(self, corpus, parse, tokenization):
     """
     Return corpus elements for the given corpus.

     A path string or an ElementTree is loaded through
     SentenceGraph.loadCorpus with the given parse and tokenization; any
     other object is returned unchanged.
     """
     needsLoading = type(corpus) == types.StringType or isinstance(
         corpus, ET.ElementTree)  # corpus is in file
     if not needsLoading:
         return corpus
     return SentenceGraph.loadCorpus(corpus, parse, tokenization)
Beispiel #13
0
                         "--analyses",
                         default="",
                         dest="analyses",
                         help="selected optional analyses")
    (options, args) = optparser.parse_args()

    if options.output != None:
        if os.path.exists(options.output):
            print >> sys.stderr, "Output directory exists, removing", options.output
            shutil.rmtree(options.output)
        os.makedirs(options.output)

    if options.analyses != "bionlp11":
        corpusElements = SentenceGraph.loadCorpus(
            options.input,
            options.parse,
            options.tokenization,
            removeIntersentenceInteractionsFromCorpusElements=False)
        print >> sys.stderr, "tokenization:", options.tokenization
        print >> sys.stderr, "parse:", options.parse

    #calculateMainStatistics(corpusElements)
    #analyzeLengths(corpusElements)
    #countMultipleEdges(corpusElements)
    if options.analyses.find("entities") != -1:
        listEntities(corpusElements)
    if options.analyses.find("structures") != -1:
        listStructures(corpusElements)
    if options.analyses.find("linear_distance") != -1:
        analyzeLinearDistance(corpusElements)
    if options.analyses.find("pos_counts") != -1:
Beispiel #14
0
 def loadCorpus(self, corpus, parse, tokenization):
     """
     Hand back corpus elements, loading them first when necessary.

     Strings and ElementTree instances go through
     SentenceGraph.loadCorpus(corpus, parse, tokenization); every other
     value is passed through as-is.
     """
     if type(corpus) == types.StringType:
         isStored = True
     else:
         isStored = isinstance(corpus, ET.ElementTree)
     # corpus is in file
     if isStored:
         loaded = SentenceGraph.loadCorpus(corpus, parse, tokenization)
         return loaded
     return corpus
Beispiel #15
0
# Class-name file for the edge ids of the corpus named by CONSTANT_CORPUS
EDGE_CLASS_NAMES = "%s-edge-ids.class_names" % CONSTANT_CORPUS

# Style string for edge example generation (not used in the statements below)
EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,genia_limits"

# The condition is the constant False, so none of the statements in this
# block are executed.
# NOTE(review): looks like a manual on/off switch for this stage - confirm
if False:
    ###############################################################################
    # Head token detection
    ###############################################################################

    # Find heads
    # NOTE(review): the parent directory is presumably added so that the
    # imports below resolve - confirm against the project layout
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils

    # Load each BI_* corpus with the chosen parse and tokenization and write
    # the root element of the loaded corpus to the corresponding *_FILE
    ETUtils.write(
        SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement,
        TEST_FILE)
    ETUtils.write(
        SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement,
        DEVEL_FILE)
    ETUtils.write(
        SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement,
        TRAIN_FILE)
    ETUtils.write(
        SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE,
                                 TOK).rootElement, TRAIN_AND_DEVEL_FILE)

    ###############################################################################
    # Trigger example generation
    ###############################################################################
    # Note: the message reports TOK (the tokenization name), not PARSE
    print >> sys.stderr, "Trigger examples for parse", TOK
Beispiel #16
0
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    import os

    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input file (interaction XML)")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file name")
    optparser.add_option("-e", "--test", default=None, dest="test", help="")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse XML element name")
    optparser.add_option(
        "-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization XML element name"
    )
    (options, args) = optparser.parse_args()

    corpus = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization)
    gaz = NameGazetteer.build(corpus, options.output, options.parse, options.tokenization)

    if options.test != None:
        corpus = SentenceGraph.loadCorpus(options.test, options.parse, options.tokenization)
    for sentence in corpus.sentences:
        tokenSet = gaz.matchTokens(sentence.tokens, sentence.sentenceGraph.tokenIsName)
        string = ""
        for token in sentence.tokens:
            chain = False
            if token in tokenSet:
                chain = True
                if string != "":
                    string += "\t"
                string += token.get("text")
            elif chain:
Beispiel #17
0
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--input", default=defaultAnalysisFilename, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="tokenization")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="parse")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output-folder")
    optparser.add_option("-a", "--analyses", default="", dest="analyses", help="selected optional analyses")
    (options, args) = optparser.parse_args()

    if options.output != None:
        if os.path.exists(options.output):
            print >> sys.stderr, "Output directory exists, removing", options.output
            shutil.rmtree(options.output)
        os.makedirs(options.output)
    
    if options.analyses != "bionlp11":
        corpusElements = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization, removeIntersentenceInteractionsFromCorpusElements=False)
        print >> sys.stderr, "tokenization:", options.tokenization
        print >> sys.stderr, "parse:", options.parse
    
    #calculateMainStatistics(corpusElements)
    #analyzeLengths(corpusElements)
    #countMultipleEdges(corpusElements)
    if options.analyses.find("entities") != -1:
        listEntities(corpusElements)
    if options.analyses.find("structures") != -1:
        listStructures(corpusElements)
    if options.analyses.find("linear_distance") != -1:
        analyzeLinearDistance(corpusElements)
    if options.analyses.find("pos_counts") != -1:
        countPOS(corpusElements)
    if options.analyses.find("pos_pair_counts") != -1: