def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False, errorMatrix=False, verbose=False):
    """Evaluate a predicted interaction XML corpus against a gold corpus.

    Builds the class-id sets required by EvaluatorClass, loads both corpora
    as sentence graphs and hands them to processCorpora, whose result is
    returned.  goldCorpusFile may be None, in which case only the
    predictions are loaded.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile
    # Class sets translate type strings into the numeric ids the evaluator uses.
    classSets = {}
    evaluatorType = EvaluatorClass.type
    if evaluatorType == "binary":
        for setName in ("entity", "interaction"):
            classSets[setName] = IdSet(idDict={"True": 1, "False": -1}, locked=True)
        negativeClassId = -1
    elif evaluatorType == "multiclass":
        for setName in ("entity", "interaction"):
            classSets[setName] = IdSet(idDict={"neg": 1}, locked=False)
        negativeClassId = 1
    else:
        sys.exit("Unknown evaluator type")
    # Load corpus and make sentence graphs; the gold corpus is optional.
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(goldCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(inputCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)
    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements, goldCorpusElements, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=errorMatrix, verbose=verbose)
def build(self, corpus, parse, tokenization=None):
    """Tally interaction-type counts for every ordered POS-tag pair.

    corpus: a loaded corpus object, or a file path / ElementTree (loaded
    via SentenceGraph.loadCorpus first).
    Populates self.counts[(pos1, pos2)][interactionType or "neg"] and
    finishes by calling self.update().
    """
    assert corpus != None
    if type(corpus) == types.StringType or isinstance(corpus, ET.ElementTree): # corpus is in file
        corpus = SentenceGraph.loadCorpus(corpus, parse, tokenization)
    for sentence in corpus.sentences:
        sentenceGraph = sentence.sentenceGraph
        # Sentences without a parse have no graph; skip them.
        if sentenceGraph == None:
            continue
        # Every ordered token pair in the sentence contributes one count.
        for t1 in sentenceGraph.tokens:
            for t2 in sentenceGraph.tokens:
                posTuple = ( t1.get("POS"), t2.get("POS") )
                if not self.counts.has_key(posTuple):
                    self.counts[posTuple] = {}
                if sentenceGraph.interactionGraph.has_edge(t1, t2):
                    # One count per parallel interaction edge between t1 and t2.
                    # NOTE(review): assumes get_edge_data returns a dict keyed
                    # 0..n-1 (multigraph convention) -- confirm against the
                    # interactionGraph implementation.
                    intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={})
                    for i in range(len(intEdges)):
                        intElement = intEdges[i]["element"]
                        intType = intElement.get("type")
                        if not self.counts[posTuple].has_key(intType):
                            self.counts[posTuple][intType] = 0
                        self.counts[posTuple][intType] += 1
                else:
                    # No interaction between this pair: count it as a negative.
                    if not self.counts[posTuple].has_key("neg"):
                        self.counts[posTuple]["neg"] = 0
                    self.counts[posTuple]["neg"] += 1
    self.update()
def run(cls, input, output, parse, tokenization, style, idFileTag=None):
    """
    An interface for running the example builder without needing to create a class.

    input: interaction XML file (or an already-built list of sentences).
    output: destination the examples are written to.
    parse / tokenization: analysis element names used to build sentence graphs.
    style: optional style-parameter string for the builder.
    idFileTag: tag used by getIdSets to locate class/feature id files.
    """
    classSet, featureSet = cls.getIdSets(idFileTag)
    if style != None:
        e = IntersentenceEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
    else:
        e = IntersentenceEdgeExampleBuilder(classSet=classSet, featureSet=featureSet)
    # Load documents
    if type(input) != types.ListType:
        # Load corpus and make sentence graphs
        corpusElements = SentenceGraph.loadCorpus(input, parse, tokenization, False, True)
    else:
        # BUG FIX: this branch previously executed 'assert (removeNameInfo == False)',
        # but no such name exists in this scope, so list input always raised
        # NameError.  A prebuilt sentence list needs no loading; return it as-is
        # (matching the original early return).
        return input
    # run examplebuilder
    e.buildExamplesForDocuments(corpusElements.documentSentences, output, idFileTag)
def build(self, corpus, parse, tokenization=None):
    """Count interaction types per ordered POS-tag pair over all token pairs.

    Accepts a loaded corpus, a file path or an ElementTree; the latter two
    are loaded via SentenceGraph.loadCorpus.  Fills self.counts and then
    calls self.update().
    """
    assert corpus != None
    # A path or an ElementTree means the corpus still has to be loaded.
    if type(corpus) == types.StringType or isinstance(corpus, ET.ElementTree):
        corpus = SentenceGraph.loadCorpus(corpus, parse, tokenization)
    for sentence in corpus.sentences:
        graph = sentence.sentenceGraph
        if graph == None:
            continue  # sentence has no parse
        tokens = graph.tokens
        for tokA in tokens:
            for tokB in tokens:
                pairKey = (tokA.get("POS"), tokB.get("POS"))
                pairCounts = self.counts.setdefault(pairKey, {})
                if graph.interactionGraph.has_edge(tokA, tokB):
                    # Count every parallel interaction edge between the pair.
                    edgeData = graph.interactionGraph.get_edge_data(tokA, tokB, default={})
                    for edgeIndex in range(len(edgeData)):
                        label = edgeData[edgeIndex]["element"].get("type")
                        pairCounts[label] = pairCounts.get(label, 0) + 1
                else:
                    # No interaction: record the pair as a negative.
                    pairCounts["neg"] = pairCounts.get("neg", 0) + 1
    self.update()
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """Compute head-token offsets for every entity in an interaction XML corpus.

    input: corpus file / ElementTree.
    parse, tokenization: analysis element names used to build sentence graphs.
    output: optional destination for the updated XML (written in the
    non-iterate branch; passed to the corpus iterator in the iterate branch).
    removeExisting: drop any pre-existing headOffset attributes first.
    iterate: stream the corpus sentence-by-sentence instead of loading it
    whole (lower memory).  NOTE: only the non-iterate branch returns the
    xml object; the iterate branch returns None.
    """
    if iterate:
        # Imports are local so the streaming dependencies are only needed here.
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the graph and mapping interactions recomputes the
                # head offsets as a side effect.
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False, errorMatrix=False, verbose=False):
    """Evaluate predictions in inputCorpusFile against goldCorpusFile.

    EvaluatorClass.type ("binary" or "multiclass") selects how class
    strings are mapped to numeric ids; goldCorpusFile may be None.
    Returns the result of processCorpora.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile
    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    if EvaluatorClass.type == "binary":
        classSets["entity"] = IdSet(idDict={"True":1,"False":-1}, locked=True)
        classSets["interaction"] = IdSet(idDict={"True":1,"False":-1}, locked=True)
        negativeClassId = -1
    elif EvaluatorClass.type == "multiclass":
        classSets["entity"] = IdSet(idDict={"neg":1}, locked=False)
        classSets["interaction"] = IdSet(idDict={"neg":1}, locked=False)
        negativeClassId = 1
    else:
        sys.exit("Unknown evaluator type")
    # Load corpus and make sentence graphs; the gold corpus is optional.
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(goldCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(inputCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)
    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements, goldCorpusElements, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=errorMatrix, verbose=verbose)
"The file to which the new XML structure is saved. If None, will be the same as target.", metavar="FILE") optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Tokenization element name") optparser.add_option("-p", "--parse", default=None, dest="parse", help="Parse element name") (options, args) = optparser.parse_args() print >> sys.stderr, "Loading input file", options.input corpusElements = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization) counter = ProgressCounter(len(corpusElements.sentences), "Resolving chains") tags = ["e1", "e2"] for sentence in corpusElements.sentences: counter.update( 1, "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ") identityChainDict = {} tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores() for interaction in sentence.interactions: if interaction.attrib["type"] == "identity": e1 = sentence.entitiesById[interaction.attrib["e1"]] e2 = sentence.entitiesById[interaction.attrib["e2"]] t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1]
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """Determine head-token offsets for all entities in the corpus.

    In iterate mode the corpus is streamed sentence-by-sentence (lower
    memory) and nothing is returned; otherwise the whole corpus is loaded,
    optionally written to 'output', and the xml object is returned.
    removeExisting drops pre-existing headOffset attributes first.
    """
    if iterate:
        # Local imports: only the streaming branch needs these modules.
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Graph construction plus mapInteractions recomputes heads.
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def buildExamples(corpusDir, outPath):
    """Build trigger and edge example files for the BioInfer corpus.

    corpusDir: directory containing the head-less BioInfer interaction XML files.
    outPath: working directory where head-annotated XML files are written.
    Existing output files are reused (each step is skipped if its file exists).
    """
    # define shortcuts for commonly used files
    PARSE = "stanford-newMC-intra" #"split-Charniak-Lease"
    TOK = "split-McClosky"
    CORPUS_DIR = corpusDir
    # xml files without heads
    BI_DEVEL_FILE = CORPUS_DIR + "/bioinfer.devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TEST_FILE = CORPUS_DIR + "/bioinfer.test.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_FILE = CORPUS_DIR + "/bioinfer.train.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_AND_DEVEL_FILE = CORPUS_DIR + "/bioinfer.train+devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    # xml files with head tokens
    # BUG FIX: these paths previously used the undefined name 'outpath'
    # (NameError at runtime); the parameter is 'outPath'.
    TEST_FILE = outPath + "/bioinfer-test-" + PARSE + ".xml"
    DEVEL_FILE = outPath + "/bioinfer-devel-" + PARSE + ".xml"
    TRAIN_FILE = outPath + "/bioinfer-train-" + PARSE + ".xml"
    TRAIN_AND_DEVEL_FILE = outPath + "/bioinfer-train-and-devel-" + PARSE + ".xml"
    WORKDIR = outPath
    # Find heads: loadCorpus adds head offsets as a side effect, then the
    # annotated corpus is cached on disk so reruns can skip this step.
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    if not os.path.exists(TEST_FILE):
        c = SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TEST_FILE)
    if not os.path.exists(DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, DEVEL_FILE)
    if not os.path.exists(TRAIN_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_FILE)
    if not os.path.exists(TRAIN_AND_DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_AND_DEVEL_FILE)
    ###########################################################################
    # Trigger example generation
    ###########################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    if not os.path.exists("gazetteer-train-" + TOK):
        Gazetteer.run(TRAIN_FILE, "gazetteer-train-" + TOK, TOK)
    if not os.path.exists("gazetteer-train-and-devel-" + TOK):
        Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-" + TOK, TOK)
    # generate the files for the old charniak
    if not os.path.exists("trigger-train-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, "trigger-train-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, "trigger-devel-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-train-and-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)
    if not os.path.exists("trigger-test-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TEST_FILE, "trigger-test-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)
    ###########################################################################
    # Edge example generation
    ###########################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"
    if not os.path.exists("edge-train-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_FILE, "edge-train-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(DEVEL_FILE, "edge-devel-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-train-and-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_AND_DEVEL_FILE, "edge-train-and-devel-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    # NOTE! These TEST examples will be based on gold standard triggers!
    if not os.path.exists("edge-test-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TEST_FILE, "edge-test-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
EDGE_TEST_EXAMPLE_FILE="edge-test-examples-"+PARSE EDGE_CLASS_NAMES="bioinfer-edge-ids.class_names" EDGE_FEATURE_PARAMS="style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits" if True: ############################################################################### # Head token detection ############################################################################### # Find heads sys.path.append("..") import Core.SentenceGraph as SentenceGraph import cElementTreeUtils as ETUtils ETUtils.write(SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement, TEST_FILE) ETUtils.write(SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement, DEVEL_FILE) ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement, TRAIN_FILE) ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK).rootElement, TRAIN_AND_DEVEL_FILE) ############################################################################### # Trigger example generation ############################################################################### print >> sys.stderr, "Trigger examples for parse", PARSE Gazetteer.run(TRAIN_FILE, "gazetteer-train-"+TOK, TOK) Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-"+TOK, TOK) # Generate example files GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, TRIGGER_TRAIN_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK) GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, TRIGGER_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK) GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, TRIGGER_TRAIN_AND_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK) GeneralEntityTypeRecognizerGztr.run(TEST_FILE, TRIGGER_TEST_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
def buildExamples(corpusDir, outPath):
    """Build trigger and edge example files for the BioInfer corpus.

    corpusDir: directory containing the head-less BioInfer interaction XML files.
    outPath: working directory where head-annotated XML files are written.
    Every step checks for its output file first, so an interrupted run can
    be resumed without recomputing finished stages.
    """
    # define shortcuts for commonly used files
    PARSE = "stanford-newMC-intra" #"split-Charniak-Lease"
    TOK = "split-McClosky"
    CORPUS_DIR = corpusDir
    # xml files without heads
    BI_DEVEL_FILE = CORPUS_DIR + "/bioinfer.devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TEST_FILE = CORPUS_DIR + "/bioinfer.test.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_FILE = CORPUS_DIR + "/bioinfer.train.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_AND_DEVEL_FILE = CORPUS_DIR + "/bioinfer.train+devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    # xml files with head tokens
    # BUG FIX: the paths below previously used the undefined name 'outpath'
    # (NameError at runtime); the parameter is 'outPath'.
    TEST_FILE = outPath + "/bioinfer-test-" + PARSE + ".xml"
    DEVEL_FILE = outPath + "/bioinfer-devel-" + PARSE + ".xml"
    TRAIN_FILE = outPath + "/bioinfer-train-" + PARSE + ".xml"
    TRAIN_AND_DEVEL_FILE = outPath + "/bioinfer-train-and-devel-" + PARSE + ".xml"
    WORKDIR = outPath
    # Find heads: loadCorpus adds head offsets as a side effect; cache the
    # annotated corpora on disk.
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    if not os.path.exists(TEST_FILE):
        c = SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TEST_FILE)
    if not os.path.exists(DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, DEVEL_FILE)
    if not os.path.exists(TRAIN_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_FILE)
    if not os.path.exists(TRAIN_AND_DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_AND_DEVEL_FILE)
    ###########################################################################
    # Trigger example generation
    ###########################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    if not os.path.exists("gazetteer-train-"+TOK):
        Gazetteer.run(TRAIN_FILE, "gazetteer-train-"+TOK, TOK)
    if not os.path.exists("gazetteer-train-and-devel-"+TOK):
        Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-"+TOK, TOK)
    # generate the files for the old charniak
    if not os.path.exists("trigger-train-examples-"+PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, "trigger-train-examples-"+PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    if not os.path.exists("trigger-devel-examples-"+PARSE):
        GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, "trigger-devel-examples-"+PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    if not os.path.exists("trigger-train-and-devel-examples-"+PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-"+PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
    if not os.path.exists("trigger-test-examples-"+PARSE):
        GeneralEntityTypeRecognizerGztr.run(TEST_FILE, "trigger-test-examples-"+PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
    ###########################################################################
    # Edge example generation
    ###########################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"
    if not os.path.exists("edge-train-examples-"+PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_FILE, "edge-train-examples-"+PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-devel-examples-"+PARSE):
        MultiEdgeExampleBuilder.run(DEVEL_FILE, "edge-devel-examples-"+PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-train-and-devel-examples-"+PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_AND_DEVEL_FILE, "edge-train-and-devel-examples-"+PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    # NOTE! These TEST examples will be based on gold standard triggers!
    if not os.path.exists("edge-test-examples-"+PARSE):
        MultiEdgeExampleBuilder.run(TEST_FILE, "edge-test-examples-"+PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
def loadCorpus(self, corpus, parse, tokenization):
    """Return the corpus as loaded CorpusElements.

    A file path or an ElementTree is parsed via SentenceGraph.loadCorpus;
    anything else is assumed to be an already-loaded corpus and is
    returned unchanged.
    """
    needsLoading = type(corpus) == types.StringType or isinstance(corpus, ET.ElementTree)
    if not needsLoading:
        # Already a loaded corpus object; pass it through.
        return corpus
    return SentenceGraph.loadCorpus(corpus, parse, tokenization)
"--analyses", default="", dest="analyses", help="selected optional analyses") (options, args) = optparser.parse_args() if options.output != None: if os.path.exists(options.output): print >> sys.stderr, "Output directory exists, removing", options.output shutil.rmtree(options.output) os.makedirs(options.output) if options.analyses != "bionlp11": corpusElements = SentenceGraph.loadCorpus( options.input, options.parse, options.tokenization, removeIntersentenceInteractionsFromCorpusElements=False) print >> sys.stderr, "tokenization:", options.tokenization print >> sys.stderr, "parse:", options.parse #calculateMainStatistics(corpusElements) #analyzeLengths(corpusElements) #countMultipleEdges(corpusElements) if options.analyses.find("entities") != -1: listEntities(corpusElements) if options.analyses.find("structures") != -1: listStructures(corpusElements) if options.analyses.find("linear_distance") != -1: analyzeLinearDistance(corpusElements) if options.analyses.find("pos_counts") != -1:
def loadCorpus(self, corpus, parse, tokenization):
    """Load the corpus if it is given as a file path or ElementTree.

    An already-loaded corpus object is returned as-is.
    """
    if type(corpus) == types.StringType or isinstance(corpus, ET.ElementTree):
        # Given as a file name or parsed XML tree: build the corpus elements.
        return SentenceGraph.loadCorpus(corpus, parse, tokenization)
    return corpus
EDGE_CLASS_NAMES = "%s-edge-ids.class_names" % CONSTANT_CORPUS EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,genia_limits" if False: ############################################################################### # Head token detection ############################################################################### # Find heads sys.path.append("..") import Core.SentenceGraph as SentenceGraph import cElementTreeUtils as ETUtils ETUtils.write( SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement, TEST_FILE) ETUtils.write( SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement, DEVEL_FILE) ETUtils.write( SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement, TRAIN_FILE) ETUtils.write( SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK).rootElement, TRAIN_AND_DEVEL_FILE) ############################################################################### # Trigger example generation ############################################################################### print >> sys.stderr, "Trigger examples for parse", TOK
print >> sys.stderr, "Psyco not installed" from optparse import OptionParser import os optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.") optparser.add_option("-i", "--input", default=None, dest="input", help="Input file (interaction XML)") optparser.add_option("-o", "--output", default=None, dest="output", help="Output file name") optparser.add_option("-e", "--test", default=None, dest="test", help="") optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse XML element name") optparser.add_option( "-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization XML element name" ) (options, args) = optparser.parse_args() corpus = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization) gaz = NameGazetteer.build(corpus, options.output, options.parse, options.tokenization) if options.test != None: corpus = SentenceGraph.loadCorpus(options.test, options.parse, options.tokenization) for sentence in corpus.sentences: tokenSet = gaz.matchTokens(sentence.tokens, sentence.sentenceGraph.tokenIsName) string = "" for token in sentence.tokens: chain = False if token in tokenSet: chain = True if string != "": string += "\t" string += token.get("text") elif chain:
# Command-line entry: corpus statistics / analysis script.
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--input", default=defaultAnalysisFilename, dest="input", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="tokenization")
optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="parse")
optparser.add_option("-o", "--output", default=None, dest="output", help="output-folder")
optparser.add_option("-a", "--analyses", default="", dest="analyses", help="selected optional analyses")
(options, args) = optparser.parse_args()

# Recreate the output directory from scratch if one was requested.
if options.output != None:
    if os.path.exists(options.output):
        print >> sys.stderr, "Output directory exists, removing", options.output
        shutil.rmtree(options.output)
    os.makedirs(options.output)
if options.analyses != "bionlp11":
    # Keep inter-sentence interactions for the generic statistics.
    corpusElements = SentenceGraph.loadCorpus(options.input, options.parse, options.tokenization, removeIntersentenceInteractionsFromCorpusElements=False)
    print >> sys.stderr, "tokenization:", options.tokenization
    print >> sys.stderr, "parse:", options.parse
    #calculateMainStatistics(corpusElements)
    #analyzeLengths(corpusElements)
    #countMultipleEdges(corpusElements)
    # Each analysis runs only if named in the comma-separated --analyses value.
    if options.analyses.find("entities") != -1:
        listEntities(corpusElements)
    if options.analyses.find("structures") != -1:
        listStructures(corpusElements)
    if options.analyses.find("linear_distance") != -1:
        analyzeLinearDistance(corpusElements)
    if options.analyses.find("pos_counts") != -1:
        countPOS(corpusElements)
    # --- fragment ends mid-statement; the body of this last 'if' is outside this view.
    if options.analyses.find("pos_pair_counts") != -1: