def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False):
    """
    Compare a predicted interaction XML corpus against a gold corpus.

    A banner is printed to stderr, the "entity" and "interaction" IdSets are
    built according to EvaluatorClass.type ("binary" or "multiclass"; any
    other value terminates through sys.exit), both corpora are loaded with
    CorpusElements.loadCorpus and the result of processCorpora is returned.
    The gold corpus is loaded only when goldCorpusFile is not None; otherwise
    None is handed to processCorpora in its place.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Reject unsupported evaluator types before any corpus is loaded
    evaluatorType = EvaluatorClass.type
    if evaluatorType != "binary" and evaluatorType != "multiclass":
        sys.exit("Unknown evaluator type")

    # Class sets are used to convert the types to ids that the evaluator can
    # use. Entity and interaction each get their own IdSet instance.
    if evaluatorType == "binary":
        negativeClassId = -1
        classSets = {
            "entity": IdSet(idDict={"True": 1, "False": -1}, locked=True),
            "interaction": IdSet(idDict={"True": 1, "False": -1}, locked=True),
        }
    else:
        negativeClassId = 1
        classSets = {
            "entity": IdSet(idDict={"neg": 1}, locked=False),
            "interaction": IdSet(idDict={"neg": 1}, locked=False),
        }

    def loadElements(corpusFile):
        # Both corpora are loaded with identical parse settings
        return CorpusElements.loadCorpus(
            corpusFile, parse, tokenization, removeIntersentenceInteractions)

    # Load corpus and make sentence graphs (gold first, then predictions)
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = loadElements(goldCorpusFile)
    predictedCorpusElements = loadElements(inputCorpusFile)

    # Compare the corpora and print results on screen
    return processCorpora(
        EvaluatorClass,
        predictedCorpusElements,
        goldCorpusElements,
        target,
        classSets,
        negativeClassId,
        entityMatchFunction)
def fromXML(self, input, parse, tokenization=None):
    """
    Rebuild self.names from the name entities of a corpus.

    input is either a string, which is passed to CorpusElements.loadCorpus
    together with parse and tokenization, or an already loaded corpus object
    exposing a `sentences` attribute (parse and tokenization are then unused).

    self.names is reset to an empty dict first. For every entity whose
    "isName" attribute equals "True", two entries are registered through
    self.addName: the entity's token list, and a one-element list holding
    the tokens concatenated without a separator.

    Raises AssertionError if self.getTokens returns no tokens for a name
    entity.
    """
    self.names = {}
    # isinstance with StringTypes also accepts unicode paths and str
    # subclasses; an exact type() comparison sent those down the
    # "already loaded corpus" branch, where they failed on `.sentences`.
    if isinstance(input, types.StringTypes):
        corpus = CorpusElements.loadCorpus(input, parse, tokenization)
    else:
        corpus = input
    for sentence in corpus.sentences:
        tokenTuples = self.prepareTokens(sentence.tokens)
        for entity in sentence.entities:
            if entity.get("isName") != "True":
                continue
            tokens = self.getTokens(entity, tokenTuples)
            assert len(tokens) > 0
            self.addName(tokens)
            # Also register the name with its tokens joined together
            self.addName(["".join(tokens)])
def gifxmlToGenia(input, output, task=1, outputIsA2File=False, submission=False, verbose=True, strengths=False):
    """
    Convert an interaction XML corpus into shared task (genia format) files.

    input          -- corpus handed to CorpusElements.loadCorpus
    output         -- output directory, a single a2 file (outputIsA2File=True)
                      or, when the name contains "tar.gz", the archive to
                      create; in that case files are first written to the
                      directory "temp-genia-format" and packed afterwards
    task           -- 1, 2 or 3 (asserted), forwarded to processCorpus
    outputIsA2File -- skip all directory handling, forwarded to processCorpus
    submission     -- also call makeSubmissionFile for directory output
    verbose        -- print progress messages to stderr
    strengths      -- forwarded to processCorpus

    An existing output directory is deleted and recreated. Returns None.
    """
    assert task in (1, 2, 3)
    outputTarFilename = None
    # Make or clear output directory
    if verbose:
        # Trailing comma: the message is completed by the prints below
        print >> sys.stderr, "Writing shared task files",
    if not outputIsA2File:
        if "tar.gz" in output:
            # Write into a scratch directory that is archived at the end
            outputTarFilename = output
            output = "temp-genia-format"
        if os.path.exists(output):
            if verbose:
                print >> sys.stderr, "over existing directory", output
            shutil.rmtree(output)
        else:
            if verbose:
                print >> sys.stderr, "to directory", output
        os.mkdir(output)

    # Convert the gifxml to the genia format files
    inputCorpus = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    processCorpus(inputCorpus, output, task, outputIsA2File, verbose=verbose, strengths=strengths)

    if submission:
        if not outputIsA2File:
            # Use the local output directory rather than the script-level
            # `options` global, which does not exist when this function is
            # called from other code.
            # NOTE(review): assumes makeSubmissionFile expects the directory
            # that was just written -- confirm against its definition.
            makeSubmissionFile(output, output.split("/")[-1] + ".tar.gz")
        else:
            print >> sys.stderr, "Warning: Single a2-file output, no submission package created"

    if outputTarFilename is not None:
        print >> sys.stderr, "Compressing output to", outputTarFilename
        outputTarFile = tarfile.open(outputTarFilename, "w:gz")
        try:
            # arcname stores each entry relative to the output directory
            # without changing the process working directory.
            for fileName in os.listdir(output):
                outputTarFile.add(os.path.join(output, fileName), arcname=fileName)
        finally:
            # Must actually be called, otherwise the gzip stream is never
            # finalised.
            outputTarFile.close()
def fromXML(self, input, parse, tokenization=None):
    """
    Rebuild self.names from the name entities of a corpus.

    input is either a string, which is passed to CorpusElements.loadCorpus
    together with parse and tokenization, or an already loaded corpus object
    exposing a `sentences` attribute (parse and tokenization are then unused).

    self.names is reset to an empty dict first. For every entity whose
    "isName" attribute equals "True", two entries are registered through
    self.addName: the entity's token list, and a one-element list holding
    the tokens concatenated without a separator.

    Raises AssertionError if self.getTokens returns no tokens for a name
    entity.
    """
    self.names = {}
    # isinstance with StringTypes also accepts unicode paths and str
    # subclasses; an exact type() comparison sent those down the
    # "already loaded corpus" branch, where they failed on `.sentences`.
    if isinstance(input, types.StringTypes):
        corpus = CorpusElements.loadCorpus(input, parse, tokenization)
    else:
        corpus = input
    for sentence in corpus.sentences:
        tokenTuples = self.prepareTokens(sentence.tokens)
        for entity in sentence.entities:
            if entity.get("isName") != "True":
                continue
            tokens = self.getTokens(entity, tokenTuples)
            assert len(tokens) > 0
            self.addName(tokens)
            # Also register the name with its tokens joined together
            self.addName(["".join(tokens)])
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False):
    """
    Compare a predicted interaction XML corpus against a gold corpus.

    A banner is printed to stderr, the "entity" and "interaction" IdSets are
    built according to EvaluatorClass.type ("binary" or "multiclass"; any
    other value terminates through sys.exit), both corpora are loaded with
    CorpusElements.loadCorpus and the result of processCorpora is returned.
    The gold corpus is loaded only when goldCorpusFile is not None; otherwise
    None is handed to processCorpora in its place.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Reject unsupported evaluator types before any corpus is loaded
    evaluatorType = EvaluatorClass.type
    if evaluatorType != "binary" and evaluatorType != "multiclass":
        sys.exit("Unknown evaluator type")

    # Class sets are used to convert the types to ids that the evaluator can
    # use. Entity and interaction each get their own IdSet instance.
    if evaluatorType == "binary":
        negativeClassId = -1
        classSets = {
            "entity": IdSet(idDict={"True":1,"False":-1}, locked=True),
            "interaction": IdSet(idDict={"True":1,"False":-1}, locked=True),
        }
    else:
        negativeClassId = 1
        classSets = {
            "entity": IdSet(idDict={"neg":1}, locked=False),
            "interaction": IdSet(idDict={"neg":1}, locked=False),
        }

    def loadElements(corpusFile):
        # Both corpora are loaded with identical parse settings
        return CorpusElements.loadCorpus(
            corpusFile, parse, tokenization, removeIntersentenceInteractions)

    # Load corpus and make sentence graphs (gold first, then predictions)
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = loadElements(goldCorpusFile)
    predictedCorpusElements = loadElements(inputCorpusFile)

    # Compare the corpora and print results on screen
    return processCorpora(
        EvaluatorClass,
        predictedCorpusElements,
        goldCorpusElements,
        target,
        classSets,
        negativeClassId,
        entityMatchFunction)
def gifxmlToGenia(input, output, task=1, outputIsA2File=False, submission=False, verbose=True, strengths=False):
    """
    Convert an interaction XML corpus into shared task (genia format) files.

    input          -- corpus handed to CorpusElements.loadCorpus
    output         -- output directory, a single a2 file (outputIsA2File=True)
                      or, when the name contains "tar.gz", the archive to
                      create; in that case files are first written to the
                      directory "temp-genia-format" and packed afterwards
    task           -- 1, 2 or 3 (asserted), forwarded to processCorpus
    outputIsA2File -- skip all directory handling, forwarded to processCorpus
    submission     -- also call makeSubmissionFile for directory output
    verbose        -- print progress messages to stderr
    strengths      -- forwarded to processCorpus

    An existing output directory is deleted and recreated. Returns None.
    """
    assert task in (1, 2, 3)
    outputTarFilename = None
    # Make or clear output directory
    if verbose:
        # Trailing comma: the message is completed by the prints below
        print >> sys.stderr, "Writing shared task files",
    if not outputIsA2File:
        if "tar.gz" in output:
            # Write into a scratch directory that is archived at the end
            outputTarFilename = output
            output = "temp-genia-format"
        if os.path.exists(output):
            if verbose:
                print >> sys.stderr, "over existing directory", output
            shutil.rmtree(output)
        else:
            if verbose:
                print >> sys.stderr, "to directory", output
        os.mkdir(output)

    # Convert the gifxml to the genia format files
    inputCorpus = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    processCorpus(inputCorpus, output, task, outputIsA2File, verbose=verbose, strengths=strengths)

    if submission:
        if not outputIsA2File:
            # Use the local output directory rather than the script-level
            # `options` global, which does not exist when this function is
            # called from other code.
            # NOTE(review): assumes makeSubmissionFile expects the directory
            # that was just written -- confirm against its definition.
            makeSubmissionFile(output, output.split("/")[-1] + ".tar.gz")
        else:
            print >> sys.stderr, "Warning: Single a2-file output, no submission package created"

    if outputTarFilename is not None:
        print >> sys.stderr, "Compressing output to", outputTarFilename
        outputTarFile = tarfile.open(outputTarFilename, "w:gz")
        try:
            # arcname stores each entry relative to the output directory
            # without changing the process working directory.
            for fileName in os.listdir(output):
                outputTarFile.add(os.path.join(output, fileName), arcname=fileName)
        finally:
            # Must actually be called, otherwise the gzip stream is never
            # finalised.
            outputTarFile.close()
metavar="FILE") optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory") optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation") (options, args) = optparser.parse_args() # Load corpus and make sentence graphs corpusElements = CorpusElements.loadCorpus(options.input) outputTrees = [] for i in range(options.folds): newRoot = ET.Element("corpus") for key in corpusElements.rootElement.attrib.keys(): newRoot.attrib[key] = corpusElements.rootElement.attrib[key] outputTrees.append(newRoot) print >> sys.stderr, "Reading document ids" documentIds = [] for document in corpusElements.documents: docId = document.attrib["id"] assert (not docId in documentIds) documentIds.append(docId)