def load(self, input, dataSetNames=None, corpusName=None, output=None, extensions=None):
    if isinstance(input, basestring) and input.isdigit():
        return self.downloadPubmed(input, output)
    elif isinstance(input, basestring) and (os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input):
        return self.convert(input, dataSetNames, corpusName, output, extensions=extensions)
    elif isinstance(input, basestring) and not os.path.exists(input):
        fullPath = os.path.join(Settings.CORPUS_DIR, input)
        print >> sys.stderr, "Loading installed corpus from", fullPath
        if os.path.exists(fullPath):
            return ETUtils.ETFromObj(fullPath)
        else:
            pattern = input + ".+\.xml"
            matching = Utils.InteractionXML.MergeSets.getMatchingFiles(pattern, Settings.CORPUS_DIR)
            if len(matching) == 0:
                matching = Utils.InteractionXML.MergeSets.getMatchingFiles(pattern)
            if len(matching) > 0:
                return Utils.InteractionXML.MergeSets.mergeSets(pattern)
            else:
                raise Exception("Cannot find input '" + str(input) + "'")
    elif isinstance(input, dict):
        # mergeSets accepts a {set: path} dictionary directly
        return Utils.InteractionXML.MergeSets.mergeSets(input)
    else:
        print >> sys.stderr, "Processing source as interaction XML"
        return ETUtils.ETFromObj(input)
def convertXML(xml, outPath):
    xml = ETUtils.ETFromObj(xml)
    corpusObj = {"name": None, "children": []}
    root = xml.getroot()
    for document in root.getiterator("document"):
        docObj = addChild(corpusObj, document)
        for sentence in document.getiterator("sentence"):
            sentObj = addChild(docObj, sentence)
            for elType in ("entity", "interaction"):
                for element in sentence.getiterator(elType):
                    addChild(sentObj, element)
    with open(outPath, "wt") as f:
        json.dump(corpusObj, f, indent=2, cls=IJSONEncoder)
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]

    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose:
                    print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose:
                    print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1] - 1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda: defaultdict(lambda: None))
            for interaction in sentence.findall("interaction"):
                # Make the mapping both ways to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it shouldn't harm to include it, and now it works regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities) - 1):
                for j in range(i + 1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)

    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def loadCorpus(filename, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False):
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import sys, gzip

    if type(filename) == types.StringType:
        print >> sys.stderr, "Loading corpus file", filename
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization, removeIntersentenceInteractions, corpusTree, removeNameInfo)
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type" -> "entity text" -> "count"
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("given") == "True":
            continue
        eType = entity.get("type")
        if not trigDict.has_key(eType):
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if not trigDict[eType].has_key(eText):
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
def mergeSets(input, corpusDir=None, output=None, allowNone=False):
    # Find the files
    if isinstance(input, dict):
        filenames = [{"path": input[x], "set": x} for x in input]
    else:
        if corpusDir == None:
            if os.path.dirname(input):
                corpusDir = os.path.dirname(input)
                input = os.path.basename(input)
            else:
                corpusDir = os.path.normpath(Settings.DATAPATH + "/corpora")
        print >> sys.stderr, "Searching for corpus files at " + corpusDir + " using pattern " + input
        filenames = [{"path": os.path.join(corpusDir, x), "set": None} for x in getMatchingFiles(input, corpusDir)]
    # Merge the files
    print >> sys.stderr, "Merging input files", filenames
    if len(filenames) == 0:
        if allowNone:
            print >> sys.stderr, "Nothing to merge"
            return
        else:
            raise Exception("No input files found for merging")
    newRoot = None
    counts = defaultdict(int)
    for filename in filenames:
        print >> sys.stderr, "Merging file", filename["path"]
        xml = ETUtils.ETFromObj(filename["path"]).getroot()
        if newRoot == None:
            newRoot = ET.Element("corpus", xml.attrib)
        else:
            assert newRoot.attrib == xml.attrib
        for doc in xml.iter("document"):
            assert doc.get("set") != None, doc.attrib
            if filename["set"] != None:
                assert filename["set"] == doc.get("set")
            counts["set=" + doc.get("set")] += 1
            counts["set(" + filename["path"] + ")=" + doc.get("set")] += 1
        for element in xml:
            newRoot.append(element)
    print >> sys.stderr, dict(counts)
    if output != None:
        print "Writing merged corpus to", output
        ETUtils.write(newRoot, output)
    return ET.ElementTree(newRoot)
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(input, output, rules): print >> sys.stderr, "Deleting attributes, rules =", rules print >> sys.stderr, "Loading corpus file", input corpusTree = ETUtils.ETFromObj(input) corpusRoot = corpusTree.getroot() countsByType = {} for key in sorted(rules.keys()): for attribute in rules[key]: countsByType[key + ":" + attribute] = 0 removeAttributes(corpusRoot, key, rules[key], countsByType) print >> sys.stderr, "Removed" for k in sorted(countsByType.keys()): print >> sys.stderr, " " + k + ":", countsByType[k] if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)
    print >> sys.stderr, "Mapped", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def parseStats(input):
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()

    counts = defaultdict(int)
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement == None:
            counts["sentence-no-analyses"] += 1
            continue
        # Find the parses element
        parsesElement = analysesElement.find("parses")
        if parsesElement == None:
            counts["sentence-no-parses"] += 1
            continue
        # Loop through parses
        for parseElement in parsesElement:
            parserName = parseElement.get("parser")
            counts["parse:" + parserName] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts["parse:" + parserName + "(no penn)"] += 1
            if len(parseElement.findall("dependency")) == 0:
                counts["parse:" + parserName + "(no dependencies)"] += 1
            if len(parseElement.findall("phrase")) == 0:
                counts["parse:" + parserName + "(no phrases)"] += 1
        # Tokenizations
        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement == None:
            counts["sentence-no-tokenizations"] += 1
            continue
        # Loop through tokenizations
        for tokenizationElement in tokenizationsElement:
            tokenizerName = tokenizationElement.get("tokenizer")
            counts["tokenization:" + tokenizerName] += 1
            if len(tokenizationElement.findall("token")) == 0:
                counts["tokenization:" + tokenizerName + "(no tokens)"] += 1

    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts.keys()):
        print >> sys.stderr, "  ", key + ":", counts[key]
def visualize(inPath, outPath, sentId, parseName):
    setSVGOptions()
    xml = ETUtils.ETFromObj(inPath)
    sentences = {x.get("id"): x for x in xml.iter("sentence")}
    if sentId not in sentences:
        print >> sys.stderr, "Sentence id '" + sentId + "' not found"
        return
    sentence = sentences[sentId]
    parse = IXMLUtils.getParseElement(sentence, parseName)
    if not parse:
        print >> sys.stderr, "Sentence has no parse with name '" + parseName + "'"
        return
    tokenization = IXMLUtils.getTokenizationElement(sentence, parse.get("tokenizer"))
    graph = SentenceGraph(sentence, [x for x in tokenization.findall("token")], [x for x in parse.findall("dependency")])
    graph.mapInteractions([x for x in sentence.findall("entity")], [x for x in sentence.findall("interaction")])
    svgTokens = tokensToSVG(tokenization.findall("token"))
    svgEdges = edgesToSVG(svgTokens, graph)
    #writeSVG({x.id:x for x in svgTokens}, svgEdges, outPath)
    writeSVG(svgTokens, svgEdges, outPath)
def catenateElements(inputs, inputDir):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    output = {}
    for dataSet in ("devel", "train"):
        root = ET.Element("corpus", {"source": ",".join(inputs)})
        tree = ET.ElementTree(root)
        print "Processing corpus dataset", dataSet
        output[dataSet] = tree
        for input in inputs:
            corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml")
            print >> sys.stderr, "Catenating", corpusPath
            if not os.path.exists(corpusPath):
                print "Input", corpusPath, "not found"
                continue
            xml = ETUtils.ETFromObj(corpusPath)
            for document in xml.getiterator("document"):
                root.append(document)
        RecalculateIds.recalculateIds(tree)
    return output
def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    for m in methods:
        assert m in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        print >> sys.stderr, method, "pass"
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse, tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        print >> sys.stderr, method, "pass added", counts[method][0], "and removed", counts[method][1], "heads"

    print >> sys.stderr, "Summary (pass/added/removed):"
    for method in methods:
        print >> sys.stderr, " ", method, "/", counts[method][0], "/", counts[method][1]
def toSTFormat(input, output=None, outputTag="a2", useOrigIds=False, debug=False, skipArgs=[], validate=True, writeExtra=False, allAsRelations=False):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.id = document.get("pmid")
        if stDoc.id == None:
            stDoc.id = document.get("origId")
        addTextToSTDoc(stDoc, document)
        documents.append(stDoc)
        eMap = {}
        tMap = {}
        entityElementMap = {}  # for task 3
        addEntitiesToSTDoc(stDoc, document, tMap, eMap, entityElementMap, useOrigIds)
        addInteractionsToSTDoc(stDoc, document, tMap, eMap, entityElementMap, skipArgs, allAsRelations)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        writeSet(documents, output, resultFileTag=outputTag, debug=debug, writeExtra=writeExtra)
    return documents
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def convertAndEvaluate(xml, task, a2Tag, goldDir=None, debug=False):
    print >> sys.stderr, "Loading corpus", xml
    corpusTree = ETUtils.ETFromObj(xml)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    if task == None:
        task = corpusRoot.get("source")
    print >> sys.stderr, "BioNLP Shared Task evaluation for task", task

    tempdir = tempfile.mkdtemp()
    subTasks = None
    if "." in task:
        mainTask, subTasks = task.split(".")
        subTasks = [int(x) for x in subTasks]
    if subTasks != None:
        for subTask in subTasks:
            skipArgs = []
            skipModifiers = False
            if subTask == 1:
                skipArgs = ['AtLoc', 'ToLoc', 'Site']
                skipModifiers = True
            elif subTask == 2:
                skipModifiers = True
            print >> sys.stderr, "---------------", "Converting task", task, "corpus for GENIA sub task", subTask, "---------------"
            print >> sys.stderr, "Skipping arguments:", skipArgs, " Skipping modifiers:", skipModifiers
            outDir = os.path.join(tempdir, "events-" + str(subTask))
            Utils.STFormat.ConvertXML.toSTFormat(corpusTree, outDir, outputTag=a2Tag, skipArgs=skipArgs, skipModifiers=skipModifiers)
            evaluate(outDir, mainTask + "." + str(subTask), goldDir, debug)
    else:
        outDir = os.path.join(tempdir, "events")
        print >> sys.stderr, "---------------", "Converting task", task, "corpus", "---------------"
        Utils.STFormat.ConvertXML.toSTFormat(xml, outDir, outputTag=a2Tag)
        evaluate(outDir, task, goldDir, debug)
    shutil.rmtree(tempdir)
    return xml
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda: defaultdict(list))
    print "Loading DrugBank XML"
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag + "drugs", root.tag
    for drug in root.findall(preTag + "drug"):
        id = drug.find(preTag + "drugbank-id").text
        name = drug.find(preTag + "name").text
        if verbose:
            print id, name
        assert id not in data
        data[id]["name"] = name
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag + "drug-interactions").findall(preTag + "drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append([interaction.find(preTag + "drug").text, None, interaction.find(preTag + "description").text])
    return data
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        # NOTE: assumed definitions; 'sentenceText' and 'tokens' are used below but were
        # not defined in this listing.
        sentenceText = sentence.get("text")
        tokens = [x for x in sentence.getiterator("token")]
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText):
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = sentenceText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
                if not headDict[eType].has_key(headText):
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict
def analyze(self, inputs, model=None, verbose=False):
    self._init()
    if type(inputs) in types.StringTypes or not isinstance(inputs, collections.Sequence):
        inputs = [inputs]
    for xml in inputs:
        print >> sys.stderr, "Analyzing", xml
        xml = ETUtils.ETFromObj(xml)
        for document in xml.getiterator("document"):
            # Collect elements into dictionaries
            entityById = {}
            for entity in document.getiterator("entity"):
                entityById[entity.get("id")] = entity
            interactions = []
            interactionsByE1 = defaultdict(list)
            for interaction in document.getiterator("interaction"):
                interactions.append(interaction)
                interactionsByE1[interaction.get("e1")].append(interaction)
            siteOfTypes = self.buildSiteOfMap(interactions, interactionsByE1, entityById)
            # Add entity elements to analysis
            for entity in document.getiterator("entity"):
                self.addEntityElement(entity, interactionsByE1)
            # Add interaction elements to analysis
            for interaction in interactions:
                self.addInteractionElement(interaction, entityById, siteOfTypes[interaction])
        # Calculate event definition argument limits from event instances
        for event in self.events.values():
            event.countArguments()
    self._updateSupportingAnalyses()
    if verbose:
        print >> sys.stderr, self.toString()
    if model != None:
        self.save(model)
def makeSubset(input, output=None, ratio=1.0, seed=0, invert=False):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()

    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    totalFolds = min(100, count)
    selectedFolds = int(ratio * min(100, count))
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1

    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        removal = division[index] > selectedFolds - 1
        if invert:
            removal = not removal
        if removal:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))

    if output != None:
        ETUtils.write(xml, output)
    return output
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None, structureAnalyzer=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with SVMMulticlass Classifier, used for two classes.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)

    if type(classSet) == types.StringType:  # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()

    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    # Determine subtask
    task3Type = None
    for example in examples:
        assert example[3].has_key("t3type")
        task3Type = example[3]["t3type"]
        break
    if task3Type == None:
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]

    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else:  # task3Type == "negation"
            entity.set("negation", "False")

    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "multiclass":
            if isinstance(prediction, dict):
                encoded = prediction["prediction"]
                predictedModifiers = [classSet.getName(i) for i in range(len(encoded)) if encoded[i] == 1]
            else:
                predictedClassName = classSet.getName(prediction[0])
                predictedModifiers = ""
                if predictedClassName != "neg":
                    predictedModifiers = predictedClassName.split("---")
            if "negation" in predictedModifiers:
                assert not negMap.has_key(example[3]["entity"])
                negMap[example[3]["entity"]] = (True, prediction)
            if "speculation" in predictedModifiers:
                assert not specMap.has_key(example[3]["entity"])
                specMap[example[3]["entity"]] = (True, prediction)
        else:
            if example[3]["t3type"] == "speculation":
                map = specMap
            else:
                map = negMap
            if prediction[0] != 1:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (True, prediction)
            else:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (False, prediction)

    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            if specMap.has_key(eId):
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if negMap.has_key(eId):
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        else:
            if task3Type == "speculation":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set("specConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
            elif task3Type == "negation":
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set("negConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))

    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Debug mode") optparser.add_option("-v", "--validate", default=None, dest="validate", help="validate input", metavar="FILE") (options, args) = optparser.parse_args() s = StructureAnalyzer() if options.load: s.load(None, options.input) else: s.analyze(options.input.split(",")) print >> sys.stderr, "--- Structure Analysis ----" print >> sys.stderr, s.toString() if options.validate != None: print >> sys.stderr, "--- Validation ----" xml = ETUtils.ETFromObj(options.validate) s.validate(xml, simulation=False, debug=options.debug) if options.output != None: ETUtils.write(xml, options.output) elif options.output != None: print >> sys.stderr, "Structure analysis saved to", options.output s.save(None, options.output)
def processCorpus(input, parserName):
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    #counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName})
            if parse == None:
                continue
            tokenization = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")})

            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            #phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)

            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("given") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print " match", count, ETUtils.toStr(phrase), "NE" + str(phraseNECounts[phrase]), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type") + "_" + cType][1] += 1
                    matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print " NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print " MULTIMATCH(" + entity.get("charOffset") + "," + str(entity.get("altOffset")) + ")", ", ".join([x.get("type") + "_" + x.get("charOffset") for x in matches]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get("charOffset") + ")"
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1

    print "Match"
    for key in sorted(matchByType.keys()):
        print " ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts
def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None

    tree = ETUtils.ETFromObj(input)
    if tokenizationName == None:
        tokenizationName = parseName

    root = tree.getroot()
    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue
        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (tokenizationName, sId)

        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # Add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))

        # Make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head
                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # Only allow overwrite of an existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # Make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    if logFile != None:
        logFile.close()

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) + "' has non-sentence children: " + str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId + "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1], calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0], offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' new charOffset differs from origOffset: " + str([entity.get("charOffset"), entity.get("origOffset")]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get("text") != combinedText:
            if combinedText == document.get("text")[0:len(combinedText)] and document.get("text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get("id") + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception("Document '" + str(document.get("id")) + "' text differs from combined sentence text: " + str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' offset is not contained in combined sentence text: " + str([entity.attrib, offset, [0, len(combinedText)], combinedText]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' text does not match combined sentence text: " + str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey, entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set("siteOf", interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    idCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\r", " ") + "\n")
        idCount += 1
    infile.close()

    # Define classpath for java
    if bannerPath == None:
        bannerPath = Settings.BANNER_DIR
    libPath = "/lib/"
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    oldVersion = True
    classPath = bannerPath + "/bin"
    for filename in os.listdir(bannerPath + libPath):
        if filename == "uima":
            oldVersion = False
    classPath += ":" + bannerPath + libPath + "*"
    if oldVersion:
        if trovePath == None:
            trovePath = Settings.JAVA_TROVE_PATH
        assert os.path.exists(trovePath), trovePath
        classPath += ":" + trovePath  # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath
    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    cwd = os.getcwd()
    os.chdir(bannerPath)
    if oldVersion:  # old version
        args = Settings.JAVA.split() + ["-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = Settings.JAVA.split() + ["-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time() - startTime))
    os.chdir(cwd)

    # Put sentences in dictionary
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        sDict["U" + str(sCount)] = sentence
        sentenceHasEntities["U" + str(sCount)] = False
        sCount += 1

    sentencesWithEntities = 0
    totalEntities = 0
    nonSplitCount = 0
    splitEventCount = 0

    # TODO: mention.txt appears to contain predicted entities directly
    # To be able to feed BANNER documents (or poorly chopped sentences)
    # one should probably remove newlines, as BANNER separates its input
    # on newlines. Replacing all \r and \n characters should preserve the
    # character offsets.

    # Read BANNER results
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        # Add output to sentences
        for line in outfile:
            bannerId = idfile.readline().strip()
            sentence = sDict[bannerId]

            # Find or create container elements
            sentenceId = sentence.get("id")

            sText = sentence.get("text")
            start = 0
            entityCount = 0
            beginOffset = None
            # Add tokens
            splits = line.strip().split()
            for split in splits:
                tokenText, tag = split.rsplit("|", 1)
                # Determine offsets by aligning BANNER-generated tokens to original text
                cStart = sText.find(tokenText, start)
                assert cStart != -1, (tokenText, tag, sText, line)
                cEnd = cStart + len(tokenText) - 1
                start = cStart + len(tokenText)

                if tag == "O":
                    if beginOffset != None:
                        entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                        assert len(entities) > 0
                        nonSplitCount += 1
                        if len(entities) > 1:
                            splitEventCount += 1
                        for ent in entities:
                            ent.set("id", sentenceId + ".e" + str(entityCount))
                            sentence.append(ent)
                            if not sentenceHasEntities[bannerId]:
                                sentencesWithEntities += 1
                                sentenceHasEntities[bannerId] = True
                            totalEntities += 1
                            entityCount += 1
                        beginOffset = None
                else:
                    if beginOffset == None:
                        beginOffset = cStart
                    prevEnd = cEnd
        outfile.close()
        idfile.close()
    else:
        sentenceEntityCount = {}
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
        for line in mentionfile:
            bannerId, offsets, word = line.strip().split("|", 2)
            offsets = offsets.split()
            sentence = sDict[bannerId]
            map = getWhiteSpaceLessStringMap(sentence.get("text"))
            offsets[0], offsets[1] = fixWhiteSpaceLessOffset(word, sentence.get("text"), int(offsets[0]), int(offsets[1]), map)
            entities = makeEntityElements(int(offsets[0]), int(offsets[1]), sentence.get("text"), splitNewlines, elementName)
            entityText = "\n".join([x.get("text") for x in entities])
            assert entityText == word, (entityText, word, bannerId, offsets, sentence.get("id"), sentence.get("text"))
            assert len(entities) > 0, (line.strip(), sentence.get("text"))
            nonSplitCount += 1
            if len(entities) > 1:
                splitEventCount += 1
            if bannerId not in sentenceEntityCount:
                sentenceEntityCount[bannerId] = 0
            for ent in entities:
                ent.set("id", sentence.get("id") + ".e" + str(sentenceEntityCount[bannerId]))
                sentence.append(ent)
                if not sentenceHasEntities[bannerId]:
                    sentencesWithEntities += 1
                    sentenceHasEntities[bannerId] = True
                totalEntities += 1
                sentenceEntityCount[bannerId] += 1
        mentionfile.close()

    print >> sys.stderr, "BANNER found", nonSplitCount, "entities in", sentencesWithEntities, processElement + "-elements",
    print >> sys.stderr, "(" + str(sCount) + " sentences processed)"
    print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree