def catenate(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    print >> sys.stderr, "Validating ids"
    # All entity, interaction, sentence and document ids must be unique in the combined corpus
    ids = set()
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in c1Root.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in ids
            ids.add(elementId)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {}  # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True":  # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1:  # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else:  # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
def convert(input, output=None, outputRoot=None):
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"
    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()

    includeElements = ["front", "article-meta", "title-group", "article-title",
                       "abstract", "body", "sec", "p", "title"]
    collapseElements = ["front", "article-meta", "title-group", "p"]

    if outputRoot == None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")

    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
def run(cls, inFile, multiplier=1.0, outFile=None, targetLabel="neg", binary=False):
    """inFile can be a string with a file name (.xml or .xml.gz), an ElementTree,
    an Element, or an open input stream.
    multiplier adjusts the level of boosting of the non-negative predictions; it is
    a real number in (0, inf). A multiplier of 1.0 does nothing, < 1.0 decreases
    negative class confidence and > 1.0 increases negative class confidence.
    The root of the modified tree is returned and, if outFile is a string, the tree
    is written out to outFile as well."""
    print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
    tree = ETUtils.ETFromObj(inFile)
    if not ET.iselement(tree):
        assert isinstance(tree, ET.ElementTree)
        root = tree.getroot()
    else:
        root = tree

    if multiplier != -1:
        if binary:
            print >> sys.stderr, "Recall binary mode"
            classRanges = getClassRanges(root.getiterator("entity"))
            assert len(classRanges.keys()) in [0, 2]
            if len(classRanges.keys()) == 0:
                print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
        else:
            print >> sys.stderr, "Recall multiclass mode"
            classRanges = None
        for entityNode in root.getiterator("entity"):
            adjustEntity(entityNode, targetLabel, multiplier, classRanges)
    if outFile:
        ETUtils.write(root, outFile)
    return tree
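# Usage sketch for run() above (a minimal example, not from the source): the
# classmethod-style signature (cls as first parameter) suggests it lives on a
# RecallAdjust-type class; the module path, class name and file names below are
# assumptions for illustration only.
if __name__ == "__main__":
    from RecallAdjust import RecallAdjust  # assumed module/class name
    adjusted = RecallAdjust.run("predictions.xml.gz",            # hypothetical input file
                                multiplier=0.7,                  # lower "neg" confidence, boosting other classes
                                outFile="predictions-adjusted.xml.gz",
                                targetLabel="neg",
                                binary=False)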
def processCorpus(inputFilename, outputFilename, rules): print >> sys.stderr, "Loading corpus file", inputFilename if inputFilename.rsplit(".",1)[-1] == "gz": import gzip corpusTree = ET.parse(gzip.open(inputFilename)) else: corpusTree = ET.parse(inputFilename) corpusRoot = corpusTree.getroot() documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = {} for k in sorted(rules.keys()): countsByType[k] = 0 for document in documents: counter.update() for sentence in document.findall("sentence"): processSentence(sentence, rules, countsByType) print >> sys.stderr, "Removed" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ":", countsByType[k] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def extractTask2(inputfile, outputfile, inverse):
    if inverse:
        print >> sys.stderr, "Extracting task2 information from", inputfile, "to", outputfile
    else:
        print >> sys.stderr, "Removing task2 information from", inputfile, "and saving to", outputfile
    corpusRoot = getCorpus(inputfile)
    for sentence in corpusRoot.getiterator("sentence"):
        task2EntityIds = set()
        if not inverse:
            for entity in sentence.findall("entity"):
                if entity.get("type") == "Entity":
                    task2EntityIds.add(entity.get("id"))
                if entity.get("type") in ["Entity", "neg"]:
                    sentence.remove(entity)
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") in ["Site", "CSite", "AtLoc", "ToLoc", "neg"]:
                    sentence.remove(interaction)
                elif interaction.get("e1") in task2EntityIds or interaction.get("e2") in task2EntityIds:
                    # remove Theme/Cause interactions referring to task 2 entities
                    sentence.remove(interaction)
        else:
            for entity in sentence.findall("entity"):
                if entity.get("type") == "Entity":
                    task2EntityIds.add(entity.get("id"))
                if entity.get("type") != "Entity":
                    sentence.remove(entity)
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") not in ["Site", "CSite", "AtLoc", "ToLoc"]:
                    sentence.remove(interaction)
            analysesElement = sentence.find("sentenceanalyses")
            if analysesElement != None:
                sentence.remove(analysesElement)
    ETUtils.write(corpusRoot, outputfile)
def makeSubset(filename, output, ratio, seed):
    if ratio == 1.0:
        return filename
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for file", filename, "ratio", ratio, "seed", seed
    import cElementTreeUtils as ETUtils
    import Core.Split
    xml = ETUtils.ETFromObj(filename).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1

    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    ETUtils.write(xml, output)
    return output
def mergeAll(input, output=None, debug=False):
    corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    mergeDuplicateEntities(corpusElements, debug)
    mergeDuplicateInteractions(corpusElements, debug)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusElements.rootElement, output)
    return corpusElements
def writeTask3ToInteractionXML(examples, predictions, corpusElements, outputFileName, task3Type):
    import sys
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import cElementTreeUtils as ETUtils

    assert task3Type == "speculation" or task3Type == "negation"

    if type(predictions) == types.StringType:
        print >> sys.stderr, "Reading predictions from", predictions
        predictions = loadPredictions(predictions)
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = readExamples(examples, False)

    corpusTree = ETUtils.ETFromObj(corpusElements)
    corpusRoot = corpusTree.getroot()

    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "speculation":
            entity.set("speculation", "False")
        else:  # task3Type == "negation"
            entity.set("negation", "False")

    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "speculation":
            map = specMap
        else:
            map = negMap
        if prediction[0] != 1:
            assert not map.has_key(example[3]["entity"])
            map[example[3]["entity"]] = True

    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "speculation":
            if specMap.has_key(entity.get("id")):
                entity.set("speculation", "True")
            else:
                entity.set("speculation", "False")
        elif task3Type == "negation":
            if negMap.has_key(entity.get("id")):
                entity.set("negation", "True")
            else:
                entity.set("negation", "False")

    # Write corpus
    if outputFileName != None:
        print >> sys.stderr, "Writing corpus to", outputFileName
        ETUtils.write(corpusRoot, outputFileName)
    return corpusTree
def addMTMX(input, mtmxDir, output):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1

    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print "skipped"
                continue
            else:
                print "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-" + str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print counts
    ETUtils.write(xml, output)
def processCorpus(input, outDir, stem, tail, mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1

    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1

    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element

    tagging = ET.SubElement(banner, "tagging")
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"

    # Write to file
    filename = workdir + "/banner_config.xml"
    ETUtils.write(conf, filename)
    return filename
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]

    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose:
                    print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose:
                    print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(usage="%prog [options]\nGenia shared task specific pruning of invalid nodes and edges.")
    op.add_option("-i", "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o", "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option("-c", "--cycles",
                  dest="cycles",
                  help="Remove cycles (requires the presence of 'predictions' attribute in 'interaction' elements)",
                  default=False,
                  action="store_true")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if quit:
        op.print_help()
        return(False)

    corpus = ETUtils.ETFromObj(options.infile)
    cycleBrokenCount = 0
    skipCount = 0
    for document in corpus.getroot().findall('document'):
        for sentence in document.findall("sentence"):
            #sys.stderr.write("Pruning document %s\n" % document.attrib['id'])
            pruner = Pruner(sentence)
            pruner.analyse()
            if options.cycles:
                cycleBrokenCount += pruner.analyseCycles()
            pruner.prune()
    sys.stderr.write("File pruned, broke " + str(cycleBrokenCount) + " cycles\n")
    if skipCount > 0:
        sys.stderr.write("Pruning skipped " + str(skipCount) + " sentences\n")
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus
def insertTask2(inputfile, task2file, outputfile):
    print >> sys.stderr, "Adding task2 information from", task2file, "to", inputfile, "and saving to", outputfile
    t2Root = getCorpus(task2file)
    noT2Root = getCorpus(inputfile)
    sentMap = {}
    for sentence in t2Root.getiterator("sentence"):
        sentMap[sentence.get("id")] = sentence
    for sentence in noT2Root.getiterator("sentence"):
        for entity in sentMap[sentence.get("id")].findall("entity"):
            sentence.append(entity)
        for interaction in sentMap[sentence.get("id")].findall("interaction"):
            sentence.append(interaction)
    ETUtils.write(noT2Root, outputfile)
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)

    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {}  # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)

        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir + corpusName + "-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir + corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir + corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir + corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, "SRNE-", ".xml")
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            if document.get("pmid") in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(document.get("pmid"))
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None  # sentence-level mixing is disabled by default
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(inputFilename, outputFilename, rules): print >> sys.stderr, "Loading corpus file", inputFilename if inputFilename.rsplit(".", 1)[-1] == "gz": import gzip corpusTree = ET.parse(gzip.open(inputFilename)) else: corpusTree = ET.parse(inputFilename) corpusRoot = corpusTree.getroot() countsByType = {} for key in sorted(rules.keys()): for attribute in rules[key]: countsByType[key + ":" + attribute] = 0 removeAttributes(corpusRoot, key, rules[key], countsByType) print >> sys.stderr, "Removed" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ":", countsByType[k] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
def processCorpus(inputFilename, outputFilename, rules): print >> sys.stderr, "Loading corpus file", inputFilename if inputFilename.rsplit(".",1)[-1] == "gz": import gzip corpusTree = ET.parse(gzip.open(inputFilename)) else: corpusTree = ET.parse(inputFilename) corpusRoot = corpusTree.getroot() countsByType = {} for key in sorted(rules.keys()): for attribute in rules[key]: countsByType[key + ":" + attribute] = 0 removeAttributes(corpusRoot, key, rules[key], countsByType) print >> sys.stderr, "Removed" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ":", countsByType[k] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
def processCorpus(inputFilename, outputFilename, rules): print >> sys.stderr, "Loading corpus file", inputFilename corpusTree = ETUtils.ETFromObj(inputFilename) corpusRoot = corpusTree.getroot() for eType in rules.keys(): for attrRule in rules[eType].keys(): rules[eType][attrRule] = rules[eType][attrRule].split("|") documents = corpusRoot.findall("document") counter = ProgressCounter(len(documents), "Documents") countsByType = defaultdict(int) for document in documents: counter.update() for sentence in document.findall("sentence"): processSentence(sentence, rules, countsByType) print >> sys.stderr, "Removed" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ":", countsByType[k] if outputFilename != None: print >> sys.stderr, "Writing output to", outputFilename ETUtils.write(corpusRoot, outputFilename) return corpusTree
metavar="FILE") optparser.add_option("-o", "--output", default=None, dest="output", help="", metavar="FILE") (options, args) = optparser.parse_args() print >> sys.stderr, "Loading input file", options.input sourceTree = ET.parse(options.input) sourceRoot = sourceTree.getroot() print >> sys.stderr, "Merging named entity types" entities = sourceRoot.getiterator("entity") mergedByType = {} for entity in entities: if entity.attrib.has_key( "isName") and entity.attrib["isName"] == "True": if not mergedByType.has_key(entity.attrib["type"]): mergedByType[entity.attrib["type"]] = 0 mergedByType[entity.attrib["type"]] += 1 entity.attrib["type"] = "Gene/protein/RNA" print >> sys.stderr, "Merged:" for k in sorted(mergedByType.keys()): print >> sys.stderr, " " + k + ": " + str(mergedByType[k]) print >> sys.stderr, "Writing output", options.output ETUtils.write(sourceRoot, options.output)
corpusElement = ET.Element("corpus") corpusElement.attrib["source"] = "GENIA" totalSentences = 0 documentsWithSentences = [] for documentElement in documentElements: parseGraphs = documentElement.attrib["parseGraphs"] del documentElement.attrib["parseGraphs"] sentenceCount = 0 for parseGraph in parseGraphs: parseGraph.writeToInteractionXML(documentElement, sentenceCount) sentenceCount += 1 if sentenceCount > 0: documentsWithSentences.append(documentElement) totalSentences += sentenceCount visibleSet = Split.getSample(len(documentsWithSentences), options.visibleSet, 0) visibleSetDocuments = 0 visibleSetSentences = 0 for i in range(len(documentsWithSentences)): if visibleSet[i] == 0: documentElement = documentsWithSentences[i] corpusElement.append(documentElement) visibleSetDocuments += 1 visibleSetSentences += len(documentElement.findall("sentence")) ETUtils.write(corpusElement, options.output) print >> sys.stderr, "Total:", str(len(documentElements)) + " documents" print >> sys.stderr, "Total:", str(len(documentsWithSentences)) + " documents with sentences" print >> sys.stderr, "Total:", str(totalSentences) + " sentences" print >> sys.stderr, "Visible Set:", str(visibleSetDocuments) + " documents" print >> sys.stderr, "Visible Set:", str(visibleSetSentences) + " sentences"
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    idCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        # BANNER separates its input on newlines, so replace both \n and \r with
        # spaces to preserve character offsets (the original code repeated the
        # same "\n" replace twice; see also the TODO note below).
        infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\r", " ") + "\n")
        idCount += 1
    infile.close()

    # Define classpath for java
    if bannerPath == None:
        bannerPath = Settings.BANNER_DIR
    if trovePath == None:
        trovePath = Settings.JAVA_TROVE_PATH
    libPath = "/lib/"
#    if not os.path.exists(bannerPath + libPath):
#        libPath = "/libs/"
#        assert os.path.exists(bannerPath + libPath)
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    assert os.path.exists(trovePath), trovePath
    oldVersion = True
    classPath = bannerPath + "/bin"
    for filename in os.listdir(bannerPath + libPath):
        #if filename.endswith(".jar"):
        #    classPath += ":" + bannerPath + libPath + filename
        if filename == "uima":
            oldVersion = False
    classPath += ":" + bannerPath + libPath + "*"
#    classPath += ":" + bannerPath + libPath + "banner.jar"
#    classPath += ":" + bannerPath + libPath + "dragontool.jar"
#    classPath += ":" + bannerPath + libPath + "heptag.jar"
#    classPath += ":" + bannerPath + libPath + "commons-collections-3.2.1.jar"
#    classPath += ":" + bannerPath + libPath + "commons-configuration-1.6.jar"
#    classPath += ":" + bannerPath + libPath + "commons-lang-2.4.jar"
#    classPath += ":" + bannerPath + libPath + "mallet.jar"
#    classPath += ":" + bannerPath + libPath + "commons-logging-1.1.1.jar"
    if oldVersion:
        classPath += ":" + trovePath  # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath
    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    cwd = os.getcwd()
    os.chdir(bannerPath)
    if oldVersion:  # old version
        args = ["java", "-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = ["java", "-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time() - startTime))
    os.chdir(cwd)

    # Put sentences in dictionary
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        sDict["U" + str(sCount)] = sentence
        sentenceHasEntities["U" + str(sCount)] = False
        sCount += 1

    sentencesWithEntities = 0
    totalEntities = 0
    nonSplitCount = 0
    splitEventCount = 0

    # TODO: mention.txt appears to contain predicted entities directly
    # To be able to feed BANNER documents (or poorly chopped sentences)
    # one should probably remove newlines, as BANNER separates its input
    # on newlines. Replacing all \r and \n characters should preserve the
    # character offsets.

    # Read BANNER results
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        # Add output to sentences
        for line in outfile:
            bannerId = idfile.readline().strip()
            sentence = sDict[bannerId]

            # Find or create container elements
            sentenceId = sentence.get("id")

            sText = sentence.get("text")
            start = 0
            entityCount = 0
            beginOffset = None
            # Add tokens
            splits = line.strip().split()
            for split in splits:
                tokenText, tag = split.rsplit("|", 1)
                # Determine offsets by aligning BANNER-generated tokens to original text
                cStart = sText.find(tokenText, start)
                assert cStart != -1, (tokenText, tag, sText, line)
                cEnd = cStart + len(tokenText) - 1
                start = cStart + len(tokenText)
                if tag == "O":
                    if beginOffset != None:
                        ## Make element
                        #ent = ET.Element(elementName)
                        #ent.set("id", sentenceId + ".e" + str(entityCount))
                        #ent.set("charOffset", str(beginOffset) + "-" + str(prevEnd))
                        #ent.set("type", "Protein")
                        #ent.set("isName", "True")
                        #ent.set("source", "BANNER")
                        #ent.set("text", sText[beginOffset:prevEnd+1])
                        entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                        assert len(entities) > 0
                        nonSplitCount += 1
                        if len(entities) > 1:
                            splitEventCount += 1
                        for ent in entities:
                            ent.set("id", sentenceId + ".e" + str(entityCount))
                            sentence.append(ent)
                            if not sentenceHasEntities[bannerId]:
                                sentencesWithEntities += 1
                                sentenceHasEntities[bannerId] = True
                            totalEntities += 1
                            entityCount += 1
                        beginOffset = None
                else:
                    if beginOffset == None:
                        beginOffset = cStart
                    prevEnd = cEnd
        outfile.close()
        idfile.close()
    else:
        sentenceEntityCount = {}
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
        for line in mentionfile:
            bannerId, offsets, word = line.strip().split("|")
            offsets = offsets.split()
            sentence = sDict[bannerId]
            offsets[0], offsets[1] = fixOffset(line.strip(), word, int(offsets[0]), int(offsets[1]), sentence.get("text"))
            entities = makeEntityElements(int(offsets[0]), int(offsets[1]), sentence.get("text"), splitNewlines, elementName)
            entityText = "\n".join([x.get("text") for x in entities])
            assert entityText == word, (entityText, word, bannerId, offsets, sentence.get("id"), sentence.get("text"))
            assert len(entities) > 0, (line.strip(), sentence.get("text"))
            nonSplitCount += 1
            if len(entities) > 1:
                splitEventCount += 1
            if bannerId not in sentenceEntityCount:
                sentenceEntityCount[bannerId] = 0
            for ent in entities:
                ent.set("id", sentence.get("id") + ".e" + str(sentenceEntityCount[bannerId]))
                sentence.append(ent)
                if not sentenceHasEntities[bannerId]:
                    sentencesWithEntities += 1
                    sentenceHasEntities[bannerId] = True
                totalEntities += 1
                sentenceEntityCount[bannerId] += 1
        mentionfile.close()

    print >> sys.stderr, "BANNER found", nonSplitCount, "entities in", sentencesWithEntities, processElement + "-elements"
    print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def writeSVG(svgTokens, svgEdges, fileName):
    svgElement = makeSVG(svgTokens, svgEdges)
    ETUtils.write(svgElement, fileName)
    return svgElement
try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

assert(os.path.exists(sys.argv[1]))
corpusTree = ETUtils.ETFromObj(sys.argv[1])
corpusRoot = corpusTree.getroot()

resultRoot = ET.Element("root")
specElement = ET.Element("speculation")
resultRoot.append(specElement)
negElement = ET.Element("negation")
resultRoot.append(negElement)

for sentence in corpusRoot.getiterator("sentence"):
    inSpec = False
    inNeg = False
    for entity in sentence.findall("entity"):
        if entity.get("speculation") == "True" and not inSpec:
            specElement.append(sentence)
            inSpec = True
        if entity.get("negation") == "True" and not inNeg:
            negElement.append(sentence)
            inNeg = True
        if inSpec and inNeg:
            break

ETUtils.write(resultRoot, sys.argv[2])
def convert(datasets, analysisTags, analysisPath, corpusName):
    global moveBI
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        sitesAreArguments = False
        if corpusName == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(pair[1], pair[0], "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    print >> sys.stderr, "Checking data validity"
    for doc in documents:
        STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
    print >> sys.stderr, "Writing all documents to geniaformat"
    ST.writeSet(documents, "all-geniaformat", resultFileTag="a2", debug=False, task=2, validate=False)

    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")

    if corpusName == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")

    for pair in datasets:
        if True:  #corpusName != "BI":
            print >> sys.stderr, "Adding analyses for set", pair[0]
            addAnalyses(xml, analysisTags[pair[0]], analysisPath, bigfileName)
    ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(corpusName, xml)

    # Write out converted data
    ETUtils.write(xml, bigfileName + ".xml")
    InteractionXML.MergeDuplicateEntities.mergeAll(xml, bigfileName + "-nodup.xml")

    for sourceTag in ["", "-nodup"]:
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(bigfileName + sourceTag + ".xml", "./", corpusName + "-", sourceTag + ".xml", [("devel", "train")])

        if "devel" in [x[0] for x in datasets]:
            print >> sys.stderr, "Converting back"
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml", "roundtrip/" + corpusName + "-devel" + sourceTag + "-task2", outputTag="a2", task=2)
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml", "roundtrip/" + corpusName + "-devel" + sourceTag + "-task1", outputTag="a2", task=1)
            if corpusName == "GE":
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName + "-devel" + sourceTag + "-task2", task=2, verbose=True, debug=False)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName + "-devel" + sourceTag + "-task1", task=1, verbose=True, debug=False)
            elif corpusName in ["BI", "BB"]:
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluateBX("roundtrip/" + corpusName + "-devel" + sourceTag + "-task2", corpusName)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluateBX("roundtrip/" + corpusName + "-devel" + sourceTag + "-task1", corpusName)

        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
        InteractionXML.DeleteElements.processCorpus(corpusName + "-devel" + sourceTag + ".xml", corpusName + "-devel" + sourceTag + "-empty.xml", deletionRules)
optparser.add_option("-o", "--output", default=None, dest="output", help="The file to which the new XML structure is saved. If None, will be the same as target.", metavar="FILE") (options, args) = optparser.parse_args() print >> sys.stderr, "Loading input file", options.input sourceTree = ET.parse(options.input) sourceRoot = sourceTree.getroot() print >> sys.stderr, "Removing dependencies" parsesElements = sourceRoot.getiterator("parses") for parsesElement in parsesElements: for parseElement in parsesElement.findall("parse"): dependencies = parseElement.findall("dependency") toRemove = [False] * len(dependencies) for i in range(0, len(dependencies)-1): for j in range(i+1, len(dependencies)): di = dependencies[i] dj = dependencies[j] if di.attrib["type"] == dj.attrib["type"] and di.attrib["t1"] == dj.attrib["t1"] and di.attrib["t2"] == dj.attrib["t2"]: toRemove[j] = True count = 0 for i in range(0, len(dependencies)): if toRemove[i]: parseElement.remove(dependencies[i]) count += 1 print >> sys.stderr, "Parse:", parseElement.attrib["parser"], "Removed:", count print >> sys.stderr, "Writing output", options.output ETUtils.write(sourceRoot, options.output)
def convertXML(parser, input, output, debug=False, reparse=False):
    global stanfordParserDir, stanfordParserArgs
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")

    # Put penn tree lines in input file
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") != None:  # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser": parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser": parser})
        if parse == None:
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse:  # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else:  # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."

    # Run Stanford parser
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput, workdir, True,
                                        "StanfordParser", "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding": "latin1", "errors": "replace"})
    #stanfordOutputFile = codecs.open(stanfordOutput, "rt", "utf-8")
    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")

    # Get output and insert dependencies
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        # Get parse
        if sentence.find("sentenceanalyses") != None:  # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser": parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser": parser})
        if parse == None:
            parse = ET.SubElement(analyses, "parse")
            parse.set("parser", "None")
        if reparse:
            assert len(parse.findall("dependency")) == 0
        elif len(parse.findall("dependency")) > 0:  # don't reparse
            continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            parse.set("stanford", "no_penn")
            continue
        parse.set("stanfordSource", "TEES")  # parser was run through this wrapper
        parse.set("stanfordDate", parseTimeStamp)  # links the parse to the log file
        # Get tokens
        if sentence.find("analyses") != None:
            tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer": parse.get("tokenizer")})
        else:
            tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer": parse.get("tokenizer")})
        assert tokenization != None
        count = 0
        tokenByIndex = {}
        for token in tokenization.findall("token"):
            tokenByIndex[count] = token
            count += 1
        # Insert dependencies
        deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, sentence.get("id"))
        if len(deps) == 0:
            parse.set("stanford", "no_dependencies")
            noDepCount += 1
            if parse.get("stanfordAlignmentError") != None:
                failCount += 1
        else:
            parse.set("stanford", "ok")
            if parse.get("stanfordAlignmentError") != None:
                failCount += 1
                parse.set("stanford", "partial")
        sentenceCount += 1
    stanfordOutputFile.close()

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)

    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps
    # (parse modification etc.) are applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False, task=2, validate=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")

    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task1", outputTag="a2", task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2", task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
def mainFunc(input, output, parseName, tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True): print >> sys.stderr, "Protein Name Splitter" if logFileName != None: print >> sys.stderr, "Writing log to", logFileName logFile = open(logFileName, "wt") else: logFile = None #if input.endswith(".gz"): # inFile = gzip.GzipFile(input) #else: # inFile = open(input) tree = ETUtils.ETFromObj(input) if tokenizationName == None: tokenizationName = parseName #tree = ElementTree.parse(inFile) root = tree.getroot() sentences = [x for x in root.getiterator("sentence")] counter = ProgressCounter(len(sentences), "Split Protein Names") counter.showMilliseconds = True missingTokCount = 0 for sentence in sentences: sId = sentence.get("id") counter.update(1, "Splitting names (" + sId + "): ") tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld) if tok == None: missingTokCount += 1 continue assert tok is not None, "Missing tokenization '%s' in sentence %s!" % ( tokenizationName, sId) parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld) assert parse is not None, "Missing parse '%s' in sentence %s!" % ( parseName, sId) split = splitTokens(tok, sentence, logFile) # Default names if removeOld: if newTokenizationName == None: newTokenizationName = tok.get("tokenizer") if newParseName == None: newParseName = parse.get("parser") else: if newTokenizationName == None: newTokenizationName = "split-" + tok.get("tokenizer") if newParseName == None: newParseName = "split-" + parse.get("parser") # add a new tokenization with the split tokens. splittok = addTokenization(newTokenizationName, sentence, sId) addTokensToTree(split, splittok) for a in tok.attrib: if splittok.get(a) == None: splittok.set(a, tok.get(a)) #splittok.set("split-") # make a mapping from original to split token ids. Store the # head token when given. tokenIdMap = {} for t in split: if t.head: head = t.head # traverse while head.head is not None: assert head.head != t, "Cyclic heads" head = head.head # should match (nah, punctuation problems) # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict" tokenIdMap[t.origId] = head.id else: # only allow overwrite of existing entry if the current token # is not punctuation. if t.origId not in tokenIdMap or not t.isPunct(): tokenIdMap[t.origId] = t.id # make a copy of the specified parse that refers to the split tokens # instead of the originals. newparse = addParse(newParseName, newTokenizationName, sentence, sId) for a in parse.attrib: if newparse.get(a) == None: newparse.set(a, parse.get(a)) newparse.set("ProteinNameSplitter", "True") splittok.set("ProteinNameSplitter", "True") depSeqId = 1 for d in parse.getiterator("dependency"): t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type") assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR" dep = ElementTree.SubElement(newparse, "dependency") dep.set("t1", tokenIdMap[t1]) dep.set("t2", tokenIdMap[t2]) dep.set("type", dType) dep.set("id", "split_%d" % depSeqId) depSeqId += 1 # Add in new dependencies between the split parts. 
for t in [tok for tok in split if tok.head is not None]: dep = ElementTree.SubElement(newparse, "dependency") dep.set("t1", t.head.id) dep.set("t2", t.id) dep.set("type", t.depType) dep.set("split", "PNS") for phrase in parse.getiterator("phrase"): newparse.append(phrase) # debugging #print >> sys.stderr, "NEW DEP IN", sId print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences" #indent(root) if logFile != None: logFile.close() # debugging if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(tree, output) return tree
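# Illustration (not from the original module) of the id-remapping idea used above: each
# original token id is mapped either to its own split token or, when the split token has a
# head chain, to the id of the chain's topmost head. The toy token class below is a minimal
# stand-in for the splitter's token objects.
class _ToyToken:
    def __init__(self, id, origId, head=None, isPunct=False):
        self.id, self.origId, self.head, self._punct = id, origId, head, isPunct
    def isPunct(self):
        return self._punct

def _toyTokenIdMap(splitTokens):
    tokenIdMap = {}
    for t in splitTokens:
        if t.head:
            head = t.head
            while head.head is not None: # climb to the topmost head
                head = head.head
            tokenIdMap[t.origId] = head.id
        elif t.origId not in tokenIdMap or not t.isPunct():
            # only punctuation may be overwritten by a later token with the same origId
            tokenIdMap[t.origId] = t.id
    return tokenIdMap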
def makeSentences(input, tokenizationPath, output=None, removeText=False):
    """
    Divide text in the "text" attributes of document and section elements
    into sentence elements. These sentence elements are inserted into their
    respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    if tokenizationPath.find(".tar.gz") != -1:
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None

    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        counter.update(1, "Splitting Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None: # no existing sentence split
            text = document.get("text")
            if text == None or text.strip() == "":
                continue
            newFile = os.path.join(tokenizationPath, document.get("pmid") + ".tok")
            f = openFile(newFile, tarFile)
            if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, document.get("pmid") + ".tokenized")
                f = openFile(oldFile, tarFile)
                if f == None: # no tokenization found
                    continue
            sentencesCreated += alignSentences(document, f.readlines())
            f.close()
            # Remove original text
            if removeText:
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
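# The openFile helper used above is defined elsewhere (SentenceSplitter module, not shown in
# this excerpt). The sketch below is an assumption about its behaviour, not its actual code:
# read a member from an already opened tarfile when one is given, otherwise open a plain file,
# and return None when the file does not exist.
def _openFileSketch(path, tarFile=None):
    import os
    if tarFile != None:
        try:
            return tarFile.extractfile(tarFile.getmember(path))
        except KeyError: # member not in the archive
            return None
    if os.path.exists(path):
        return open(path, "rt")
    return None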
def buildExamples(corpusDir, outPath):
    # define shortcuts for commonly used files
    PARSE = "stanford-newMC-intra" #"split-Charniak-Lease"
    TOK = "split-McClosky"
    CORPUS_DIR = corpusDir

    # xml files without heads
    BI_DEVEL_FILE = CORPUS_DIR + "/bioinfer.devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TEST_FILE = CORPUS_DIR + "/bioinfer.test.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_FILE = CORPUS_DIR + "/bioinfer.train.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_AND_DEVEL_FILE = CORPUS_DIR + "/bioinfer.train+devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"

    # xml files with head tokens
    TEST_FILE = outPath + "/bioinfer-test-" + PARSE + ".xml"
    DEVEL_FILE = outPath + "/bioinfer-devel-" + PARSE + ".xml"
    TRAIN_FILE = outPath + "/bioinfer-train-" + PARSE + ".xml"
    TRAIN_AND_DEVEL_FILE = outPath + "/bioinfer-train-and-devel-" + PARSE + ".xml"
    WORKDIR = outPath

    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    if not os.path.exists(TEST_FILE):
        c = SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TEST_FILE)
    if not os.path.exists(DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, DEVEL_FILE)
    if not os.path.exists(TRAIN_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_FILE)
    if not os.path.exists(TRAIN_AND_DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_AND_DEVEL_FILE)

    ###########################################################################
    # Trigger example generation
    ###########################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    if not os.path.exists("gazetteer-train-" + TOK):
        Gazetteer.run(TRAIN_FILE, "gazetteer-train-" + TOK, TOK)
    if not os.path.exists("gazetteer-train-and-devel-" + TOK):
        Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-" + TOK, TOK)
    # generate the files for the old charniak
    if not os.path.exists("trigger-train-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, "trigger-train-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, "trigger-devel-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-train-and-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)
    if not os.path.exists("trigger-test-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TEST_FILE, "trigger-test-examples-" + PARSE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)

    ###########################################################################
    # Edge example generation
    ###########################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"
    if not os.path.exists("edge-train-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_FILE, "edge-train-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(DEVEL_FILE, "edge-devel-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    if not os.path.exists("edge-train-and-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_AND_DEVEL_FILE, "edge-train-and-devel-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
    # NOTE! These TEST examples will be based on gold standard triggers!
    if not os.path.exists("edge-test-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TEST_FILE, "edge-test-examples-" + PARSE, PARSE, TOK, EDGE_FEATURE_PARAMS, "bioinfer-edge-ids")
# (fragment: the head of this function and the code that defines "result", "idx", "prds",
#  "total" and "boost" precede this excerpt and are not included here)
        for pW, cls in prds:
            result[idx][cls] = pW / float(total)
            assert result[idx][cls] <= 1.0 and result[idx][cls] >= 0.0, "%f/%f=%f" % (pW, total, result[idx][cls])
    for d in result:
        d["neg"] = d.get("neg", 0.0) * boost
    return result

if __name__ == "__main__":
    desc = "Weighted combination of several trigger word recognizers"
    parser = OptionParser(description=desc)
    parser.add_option("--lambda", dest="l", action="store", default=None, type="float", help="The mixing weight of predictions1 with predictions2. A number between 0 and 1. No default.")
    parser.add_option("--b1", dest="b1", action="store", default=None, type="float", help="Recall boost of file1")
    parser.add_option("--b2", dest="b2", action="store", default=None, type="float", help="Recall boost of file2")
    (options, args) = parser.parse_args()
    if options.l == None:
        print >> sys.stderr, "You need to give a lambda"
        sys.exit(1)
    tree1 = ET.parse(args[0]).getroot()
    tree2 = ET.parse(args[1]).getroot()
    assert len(tree1) == len(tree2)
    for docIdx in range(len(tree1)):
        assert len(tree1[docIdx]) == len(tree2[docIdx])
        for sIdx in range(len(tree1[docIdx])):
            newSNode = merge2sents(tree1[docIdx][sIdx], tree2[docIdx][sIdx], options.l, options.b1, options.b2)
            tree1[docIdx][sIdx] = newSNode
    ETUtils.write(tree1, sys.stdout)
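# merge2sents is defined elsewhere in this module. The sketch below shows the assumed core
# arithmetic for one token: the two per-class probability dictionaries are first recall-boosted
# on their "neg" class (b1, b2) and then mixed linearly with weight l, i.e. l*p1 + (1-l)*p2.
def _mixTokenDistributions(dist1, dist2, l, b1, b2):
    dist1 = dict(dist1)
    dist2 = dict(dist2)
    dist1["neg"] = dist1.get("neg", 0.0) * b1
    dist2["neg"] = dist2.get("neg", 0.0) * b2
    mixed = {}
    for cls in set(dist1.keys()) | set(dist2.keys()):
        mixed[cls] = l * dist1.get(cls, 0.0) + (1.0 - l) * dist2.get(cls, 0.0)
    return mixed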
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
    #print >> sys.stderr, "Writing output to Interaction XML"
    corpus = self.loadCorpus(corpus, parse, tokenization)
    if goldCorpus != None:
        goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
    examples, predictions = self.loadExamples(examples, predictions)

    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()

    #counter = ProgressCounter(len(corpus.sentences), "Write Examples")

    exampleQueue = [] # One sentence's examples
    predictionsByExample = {}
    currentMajorId = None
    prevMajorIds = set()
    processedSentenceIds = set()
    xType = None

    count = 0
    for example in examples:
        count += 1
    assert count > 0
    progress = ProgressCounter(count, "Write Examples")

    for example, prediction in itertools.izip_longest(examples, predictions):
        assert example != None
        assert prediction != None
        majorId, minorId = example[0].rsplit(".x", 1)
        #if currentMajorId == "GENIA.d114.s9": print "Start"
        if majorId != currentMajorId: # new sentence
            if currentMajorId != None:
                #if currentMajorId == "GENIA.d114.s9": print "JAA"
                processedSentenceIds.add(currentMajorId)
                sentenceObject = corpus.sentencesById[currentMajorId]
                goldSentence = None
                if goldCorpus != None:
                    goldSentence = goldCorpus.sentencesById[currentMajorId]
                self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
                progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
            exampleQueue = []
            predictionsByExample = {}
            prevMajorIds.add(currentMajorId)
            assert majorId not in prevMajorIds, majorId
            currentMajorId = majorId
        exampleQueue.append(example) # queue example
        predictionsByExample[example[0]] = prediction
        assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)

    # Process what is still in queue
    if currentMajorId != None:
        processedSentenceIds.add(currentMajorId)
        sentenceObject = corpus.sentencesById[currentMajorId]
        goldSentence = None
        if goldCorpus != None:
            goldSentence = goldCorpus.sentencesById[currentMajorId]
        self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
        progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
        exampleQueue = []
        predictionsByExample = {}

    # Process sentences with no examples (e.g. to clear interactions)
    for sentenceId in sorted(corpus.sentencesById.keys()):
        if sentenceId not in processedSentenceIds:
            sentenceObject = corpus.sentencesById[sentenceId]
            goldSentence = None
            if goldCorpus != None:
                goldSentence = goldCorpus.sentencesById[sentenceId]
            self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)

    # Print statistics
    if len(self.counts) > 0:
        print >> sys.stderr, self.counts
        self.counts = defaultdict(int)

    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpus.rootElement, outputFile)
    return corpus.tree
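# Example ids are assumed to be of the form <sentence id>.x<running number>, e.g.
# "GENIA.d114.s9.x3"; writeXML groups consecutive examples by the sentence part of the id.
# A minimal stand-alone illustration of that grouping:
def _groupExamplesBySentence(exampleIds):
    groups = {}
    for exampleId in exampleIds:
        majorId, minorId = exampleId.rsplit(".x", 1)
        groups.setdefault(majorId, []).append(exampleId)
    return groups

# _groupExamplesBySentence(["GENIA.d1.s1.x0", "GENIA.d1.s1.x1", "GENIA.d1.s2.x0"])
# -> {"GENIA.d1.s1": ["GENIA.d1.s1.x0", "GENIA.d1.s1.x1"], "GENIA.d1.s2": ["GENIA.d1.s2.x0"]}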
def tokenize(input, output=None, tokenizationName="GeniaTagger-3.0.1", extraFields=[]): #["base", "chunk", "NE"]): global geniaTaggerDir print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() # Write text to input file workdir = tempfile.mkdtemp() infile = codecs.open(os.path.join(workdir, "tagger-input.txt"), "wt", "utf-8") numCorpusSentences = 0 for sentence in corpusRoot.getiterator("sentence"): infile.write(sentence.get("text") + "\n") numCorpusSentences += 1 infile.close() # Run tagger cwd = os.getcwd() os.chdir(geniaTaggerDir) args = [geniaTaggerDir + "/geniatagger"] #args += [ "<", os.path.join(workdir, "tagger-input.txt")] #args += [ ">", os.path.join(workdir, "tagger-output.txt")] #subprocess.call(args, process = subprocess.Popen( args, stdin=codecs.open(os.path.join(workdir, "tagger-input.txt"), "rt", "utf-8"), stdout=codecs.open(os.path.join(workdir, "tagger-output.txt"), "wt", "utf-8")) waitForProcess(process, numCorpusSentences, True, os.path.join(workdir, "tagger-output.txt"), "GeniaTagger", "Tokenizing Sentences") os.chdir(cwd) # Read tokenization outfile = codecs.open(os.path.join(workdir, "tagger-output.txt"), "rt", "utf-8") # Add output to sentences for sentence in corpusRoot.getiterator("sentence"): # Find or create container elements sentenceAnalyses = sentence.find("sentenceAnalyses") if sentenceAnalyses == None: sentenceAnalyses = ET.Element("sentenceAnalyses") sentence.append(sentenceAnalyses) tokenizations = sentenceAnalyses.find("tokenizations") if tokenizations == None: tokenizations = ET.Element("tokenizations") sentenceAnalyses.append(tokenizations) prevTokenizationIndex = 0 for prevTokenization in tokenizations.findall("tokenization"): assert prevTokenization.get("tokenizer") != tokenizationName prevTokenizationIndex += 1 tokenization = ET.Element("tokenization") tokenization.set("tokenizer", tokenizationName) tokenizations.insert(prevTokenizationIndex, tokenization) sText = sentence.get("text") start = 0 tokenCount = 0 line = outfile.readline() while line.strip() != "": # Add tokens splits = line.strip().split("\t") # Determine offsets cStart = sText.find(splits[0], start) if cStart == -1: if splits[0] == "``": splits[0] = "\"" if splits[0] == "''": splits[0] = "\"" cStart = sText.find(splits[0], start) assert cStart != -1, (sentence.get("id"), sText, line, tokenCount) cEnd = cStart + len(splits[0]) start = cStart + len(splits[0]) # Make element token = ET.Element("token") token.set("id", "gt_" + str(tokenCount + 1)) token.set("text", splits[0]) if "base" in extraFields: token.set("base", splits[1]) token.set("POS", splits[2]) if "chunk" in extraFields: token.set("chunk", splits[3]) if "NE" in extraFields: token.set("NE", splits[4]) token.set("charOffset", str(cStart) + "-" + str(cEnd - 1)) # NOTE: check tokenization.append(token) tokenCount += 1 line = outfile.readline() outfile.close() # Remove work directory shutil.rmtree(workdir) if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
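# A self-contained sketch of the character-offset logic used above: each tagger token is
# located in the original sentence text left to right, and quotes normalised by the tagger
# ("``", "''") are mapped back to '"'. Offsets are inclusive, end = start + len(token) - 1.
def _tokenCharOffsets(sentenceText, tokenTexts):
    offsets = []
    searchStart = 0
    for tokText in tokenTexts:
        if tokText in ("``", "''"):
            tokText = "\""
        begin = sentenceText.find(tokText, searchStart)
        assert begin != -1, (sentenceText, tokText)
        end = begin + len(tokText) - 1
        offsets.append((begin, end))
        searchStart = end + 1
    return offsets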
selection = sentence.attrib["origId"] in idList if options.invert: selection = not selection assert (keep == None or keep == selection) keep = selection if not keep: corpusRoot.remove(document) removedDocuments += 1 removedSentences += len(sentences) else: keptDocuments += 1 keptSentences += len(sentences) else: selection = documentSets[i] != 0 if options.invert: selection = not selection if selection: corpusRoot.remove(document) removedDocuments += 1 removedSentences += len(sentences) else: keptDocuments += 1 keptSentences += len(sentences) print >> sys.stderr, "Corpus:", keptDocuments + removedDocuments, "documents,", keptSentences + removedSentences, "sentences." print >> sys.stderr, "Removed:", removedDocuments, "documents,", removedSentences, "sentences." print >> sys.stderr, "Subset:", keptDocuments, "documents,", keptSentences, "sentences." print >> sys.stderr, "Writing subset to", options.output ETUtils.write(corpusRoot, options.output)
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False): """ Run GENIA Sentence Splitter Divide text in the "text" attributes of document and section elements into sentence elements. These sentence elements are inserted into their respective parent elements. """ global sentenceSplitterDir print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR, if postProcess: print >> sys.stderr, "(Using post-processing)" else: print >> sys.stderr, "(No post-processing)" docCount = 0 sentencesCreated = 0 redivideCount = 0 sourceElements = [x for x in corpusRoot.getiterator("document") ] + [x for x in corpusRoot.getiterator("section")] counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter") counter.showMilliseconds = True # Create working directory workdir = tempfile.mkdtemp() for document in sourceElements: counter.update(1, "Splitting Documents (" + document.get("id") + "): ") docId = document.get("id") if docId == None: docId = "CORPUS.d" + str(docCount) docTag = "-" + str(docCount) assert document.find("sentence") == None text = document.get("text") if text == None or text.strip() == "": continue #print type(text) # Write text to workfile #workdir = tempfile.mkdtemp() workfile = codecs.open( os.path.join(workdir, "sentence-splitter-input.txt" + docTag), "wt", "utf-8") # From http://themoritzfamily.com/python-encodings-and-unicode.html # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode # object otherwise it will try to automatically decode the byte stream as ASCII" # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII, # thus creating an ASCII interaction XML, which then triggered here the unicode error. So, at this # point we should be able to safely write(text), as the output file is unicode, and reading with # the correct coded is taken care of earlier in the pipeline. 
workfile.write(text) #.encode("utf-8")) workfile.close() # Run sentence splitter assert os.path.exists( Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR args = [ Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt" + docTag), os.path.join(workdir, "sentence-splitter-output.txt" + docTag), Settings.RUBY_PATH ] #p = subprocess.call(args) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() if stdout != "": print >> sys.stderr, stdout if stderr != 'Extracting events.roading model file.\nstart classification.\n': print >> sys.stderr, stderr #print "stdout<", p.stdout.readlines(), ">" #print "stderr<", p.stderr.readlines(), ">" if postProcess: ppIn = codecs.open( os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8") ppOut = codecs.open( os.path.join( workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "wt", "utf-8") subprocess.call(os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR, "geniass-postproc.pl"), stdin=ppIn, stdout=ppOut) ppIn.close() ppOut.close() # Read split sentences workfile = codecs.open( os.path.join( workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "rt", "utf-8") else: workfile = codecs.open( os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8") start = 0 # sentences are consecutively aligned to the text for charOffsets sentenceCount = 0 #text = text.replace("\n", " ") # should stop sentence splitter from crashing. #text = text.replace(" ", " ") # should stop sentence splitter from crashing. #alignmentText = text.replace("\n", " ").replace("\r", " ") #docTokens = reWhiteSpace.split(text) docIndex = 0 sentenceBeginIndex = -1 prevSentence = None prevEndIndex = None emptySentenceCount = 0 prevText = None for sText in workfile.readlines(): sText = sText.strip() # The text of the sentence if sText == "": emptySentenceCount += 1 continue for i in range(len(sText)): if sText[i].isspace(): assert sText[i] not in ["\n", "\r"] continue while text[docIndex].isspace(): if text[docIndex] in ["\n", "\r" ] and sentenceBeginIndex != -1: redivideCount += 1 prevSentence = makeSentence(text, sentenceBeginIndex, docIndex - 1, prevSentence, prevEndIndex) prevSentence.set("id", docId + ".s" + str(sentenceCount)) prevSentence.set("redevided", "True") sentencesCreated += 1 sentenceCount += 1 prevEndIndex = docIndex - 1 sentenceBeginIndex = -1 document.append(prevSentence) docIndex += 1 assert sText[i] == text[docIndex], ( text, sText, prevText, sText[i:i + 10], text[docIndex:docIndex + 10], (i, docIndex), sentenceBeginIndex) # tokens[i].isspace() == False if sentenceBeginIndex == -1: sentenceBeginIndex = docIndex docIndex += 1 prevText = sText if sentenceBeginIndex != -1: prevSentence = makeSentence(text, sentenceBeginIndex, docIndex - 1, prevSentence, prevEndIndex) prevSentence.set("id", docId + ".s" + str(sentenceCount)) prevEndIndex = docIndex - 1 sentenceBeginIndex = -1 sentencesCreated += 1 sentenceCount += 1 document.append(prevSentence) # Add possible tail for last sentence if prevEndIndex < len(text) - 1 and prevSentence != None: assert prevSentence.get("tail") == None, prevSentence.get("tail") prevSentence.set("tail", text[prevEndIndex + 1:]) if emptySentenceCount > 0: print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get( "id") # Remove original text if removeText: del document["text"] # Move elements from document element 
to sentences moveElements(document) docCount += 1 print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences" print >> sys.stderr, "Redivided", redivideCount, "sentences" if debug: print >> sys.stderr, "Work directory preserved for debugging at", workdir else: # Remove work directory shutil.rmtree(workdir) if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
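# Sketch of the alignment idea used above (simplified: it leaves out the redivide-on-newline
# handling): walk the original document text and each splitter output line in parallel,
# skipping whitespace on both sides, so that every sentence gets begin/end character offsets
# into the unmodified document text.
def _alignSentenceOffsets(docText, sentenceTexts):
    offsets = []
    docIndex = 0
    for sText in sentenceTexts:
        begin = None
        for ch in sText:
            if ch.isspace():
                continue
            while docText[docIndex].isspace():
                docIndex += 1
            assert docText[docIndex] == ch, (ch, docText[docIndex:docIndex + 10])
            if begin is None:
                begin = docIndex
            docIndex += 1
        if begin is not None:
            offsets.append((begin, docIndex - 1))
    return offsets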
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}):
    """
    Insert pre-built dependency parses from parsePath into the sentence elements of the
    corpus. Parses are read from per-document files, optionally stored inside a .tar.gz
    archive.
    """
    import tarfile
    from SentenceSplitter import openFile

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, document.get("pmid") + ".sd"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, document.get("pmid") + ".dep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            # TODO: Following for-loop is the same as when used with a real parser, and should
            # be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + document.get("pmid") + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")

    if tarFile != None:
        tarFile.close()
    #print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    #print >> sys.stderr, docsWithSentences, "/", docCount, "documents have stanford parses"
    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
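# insertParse is defined elsewhere in this module. Assuming the .sd/.dep files contain one
# Stanford-style dependency per line in the usual "type(governor-i, dependent-j)" notation,
# with a blank line between sentences, a single line can be decomposed roughly like this:
def _parseDependencyLine(line):
    # e.g. "nsubj(binds-3, Protein-1)" -> ("nsubj", 3, 1); returns None for blank lines
    line = line.strip()
    if line == "" or "(" not in line:
        return None
    depType, rest = line.split("(", 1)
    govPart, depPart = rest.rstrip(")").split(", ", 1)
    govIndex = int(govPart.rsplit("-", 1)[1].rstrip("'")) # "'" marks copy nodes
    depIndex = int(depPart.rsplit("-", 1)[1].rstrip("'"))
    return depType, govIndex, depIndex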
tags = ["e1", "e2"] for sentence in corpusElements.sentences: counter.update( 1, "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ") identityChainDict = {} tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores() for interaction in sentence.interactions: if interaction.attrib["type"] == "identity": e1 = sentence.entitiesById[interaction.attrib["e1"]] e2 = sentence.entitiesById[interaction.attrib["e2"]] t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentence.sentenceGraph.entityHeadTokenByEntity[e2] if tokenHeadScores[t2] > tokenHeadScores[t1]: identityChainDict[ interaction.attrib["e1"]] = interaction.attrib["e2"] else: identityChainDict[ interaction.attrib["e2"]] = interaction.attrib["e1"] for interaction in sentence.interactions: if interaction.attrib["type"] != "identity": for tag in tags: id = interaction.attrib[tag] while identityChainDict.has_key(id): id = identityChainDict[id] if id != interaction.attrib[tag]: interaction.attrib[tag] = id print >> sys.stderr, "Writing output", options.output ETUtils.write(corpusElements.rootElement, options.output)
def interface(optionArgs=sys.argv[1:]): """ The function to handle the command-line interface. """ from optparse import OptionParser op = OptionParser( usage="%prog [options]\nGenia shared task specific unflattening.") op.add_option("-i", "--infile", dest="infile", help="Input file (gifxml)", metavar="FILE") op.add_option("-o", "--outfile", dest="outfile", help="Output file (gifxml)", metavar="FILE") op.add_option( "-p", "--perfect", dest="perfect", help="Process only those event which can be perfectly solved", action="store_true", default=False) op.add_option("-a", "--parse", dest="parse", help="Parse to be used", metavar="PARSE") op.add_option("-t", "--tokens", dest="tokens", help="Tokens to be used", metavar="TOKENS") (options, args) = op.parse_args(optionArgs) quit = False if not options.infile: print "Please specify the input file." quit = True # if not options.outfile: # print "Please specify the output file." # quit = True if not options.parse: print "Please specify the parse." quit = True if not options.tokens: print "Please specify the tokenisation." quit = True if quit: op.print_help() return (False) corpus = ETUtils.ETFromObj(options.infile) documents = corpus.getroot().findall('document') counter = ProgressCounter(len(documents), "Unflatten") for document in documents: counter.update(1, "Unflattening (" + document.get("id") + "): ") #sys.stderr.write("Unflattening document %s\n"%document.attrib['id']) unflattener = Unflattener(document, options.perfect, options.tokens, options.parse) #if len(unflattener.tokens) == 0: # continue unflattener.analyse() unflattener.unflatten() #indent(corpus.getroot()) if options.outfile: ETUtils.write(corpus, options.outfile) return corpus
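# Usage sketch: the unflattening interface can be driven from the command line or
# programmatically by passing the argument list directly; the file names below are
# hypothetical, and the parse/tokenization names are the ones used elsewhere in this pipeline.
def _exampleUnflattenUsage():
    return interface(["-i", "GENIA-devel.xml",              # input gifxml (placeholder name)
                      "-o", "GENIA-devel-unflattened.xml",  # output gifxml (placeholder name)
                      "-a", "split-McClosky",               # parse name
                      "-t", "split-McClosky",               # tokenization name
                      "-p"])                                # only unflatten perfectly solvable events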
EDGE_TEST_EXAMPLE_FILE = "edge-test-examples-" + PARSE
EDGE_CLASS_NAMES = "bioinfer-edge-ids.class_names"
EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

if True:
    ###########################################################################
    # Head token detection
    ###########################################################################
    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    ETUtils.write(SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement, TEST_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement, DEVEL_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement, TRAIN_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK).rootElement, TRAIN_AND_DEVEL_FILE)

    ###########################################################################
    # Trigger example generation
    ###########################################################################
    print >> sys.stderr, "Trigger examples for parse", PARSE
    Gazetteer.run(TRAIN_FILE, "gazetteer-train-" + TOK, TOK)
    Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-" + TOK, TOK)
    # Generate example files
    GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, TRIGGER_TRAIN_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, TRIGGER_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-" + TOK)
    GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, TRIGGER_TRAIN_AND_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)
    GeneralEntityTypeRecognizerGztr.run(TEST_FILE, TRIGGER_TEST_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-" + TOK)