def sdcToClassifier(self, sdc):
    """Find the spatial-relation classifier engine for an SDC.

    Tokenizes the SDC's verb and spatial-relation text, then picks an
    engine from self.tokenToEngine by keyword; returns None when no
    engine applies.
    """
    _, verbTokens = chunker.tokenize(sdc.verb.text)
    _, relationTokens = chunker.tokenize(sdc.spatialRelation.text)
    verbTokens = [t.lower() for t in verbTokens]
    relationTokens = [t.lower() for t in relationTokens]

    # The verb "pass" is handled by the engine trained for "past".
    if "pass" in verbTokens and "past" in self.tokenToEngine:
        return self.tokenToEngine["past"]

    for keyword, engine in self.tokenToEngine.iteritems():
        if keyword not in relationTokens:
            continue
        # Skip bare "to" when it is embedded in a longer relation such
        # as "with your back to" or "to the left of".
        embeddedTo = (keyword == "to"
                      and sdc.spatialRelation.text.lower() != "to"
                      and len(sdc.spatialRelation.text.split()) > 2)
        if not embeddedTo:
            return engine

    # "stop at X" maps to the "until" engine when a landmark is present.
    if "stop" in verbTokens:
        if not sdc.landmark.isNull() and "until" in self.tokenToEngine:
            return self.tokenToEngine["until"]

    # if "exit" in vTokens or "leave" in vTokens:
    #     if not sdc.landmark.isNull():
    #         return self.tokenToEngine["out"]
    return None
def main():
    """Build per-field token histograms over all SDCs in an annotated
    dialog corpus and plot one stacked bar chart per field.

    Reads the annotated dialogs, counts token frequencies for each SDC
    field (landmark / verb / figure / spatialRelation), then graphs each
    histogram with graphStacked and shows the figure.
    """
    dialog_fname = "sdc_annotations/stefie10.dialog.xml"
    dialogs = readDialogs(dialog_fname)

    # Plot layout parameters, shared by every graphStacked call below.
    figsize = (6, 4.5)
    bottomAdjust = 0.14
    topAdjust = 0.9

    # One histogram per SDC annotation field.
    histograms = {"landmark": Histogram(),
                  "verb": Histogram(),
                  "figure": Histogram(),
                  "spatialRelation": Histogram()}

    for dialog in dialogs:
        for turn in dialog.turns:
            for sdc in turn.sdcs:
                indexes, allTokens = chunker.tokenize(sdc.text)
                movementWords = set(["go", "going", "walk", "walking"])
                lowerTokens = [t.lower() for t in allTokens]
                # Debug output: echo SDCs that contain a movement verb.
                if len(movementWords.intersection(lowerTokens)) != 0:
                    print sdc.text
                for key in sdc.keys:
                    # Keys may carry a trailing character (e.g. a
                    # plural/index suffix); fall back to the key with
                    # its last character stripped before giving up.
                    if key in histograms:
                        histogram = histograms[key]
                    elif key[:-1] in histograms:
                        histogram = histograms[key[:-1]]
                    else:
                        raise ValueError("No histogram for: " + ` key `)
                    text = sdc.annotationMap[key].text
                    indexes, tokens = chunker.tokenize(text)
                    for t in tokens:
                        histogram.add(t)

    # One stacked bar chart per field.
    for key, histogram in histograms.iteritems():
        print "doing", key, histogram
        title = "%s" % key
        graphStacked({"all": histogram}, "histogram", title, maxCols=10,
                     xticks=None, figsize=figsize, topAdjust=topAdjust,
                     bottomAdjust=bottomAdjust)
    mpl.legend()
    mpl.show()
def gloss_map(self):
    """Return a FreqDist of tokens drawn from the synset definition
    (gloss) of each word in self.words."""
    distribution = nltk.FreqDist()
    for position, _ in enumerate(self.words):
        definition = self.synset(position).definition
        _, glossTokens = chunker.tokenize(definition)
        for token in glossTokens:
            distribution.inc(token)
    return distribution
def writeTrainingForText(self, text, annotations, out):
    """Write CRF training rows for text to the stream out.

    Each token becomes a tab-separated line of (word, POS tag, chunk
    label); the chunk label is the annotation key covering the token,
    or the literal string "None". A blank line separates sentences.
    """
    for sentence in self.sentenceTokenizer.tokenize(text):
        offsets, words = tokenize(sentence.text)
        tagged = self.tagger.tag(words)
        for offset, (word, posTag) in zip(offsets, tagged):
            # Convert the sentence-relative offset to a document-level
            # character range for annotation lookup.
            absStart = offset + sentence.start
            span = absStart, absStart + len(word)
            annotation, key = containingAnnotations(annotations, span)
            chunk = key if annotation is not None else "None"
            out.write("\t".join([word, posTag, chunk]) + "\n")
        out.write("\n")
def chunk(self, string):
    """Run the CRF chunker over string.

    Returns (indexes, tokens, labels): character offsets of the tokens,
    the tokens themselves, and the CRF-predicted label for each token.
    """
    import CRFPP
    tagger = CRFPP.Tagger("-m " + self.modelFile)
    indexes, tokens = tokenize(string)
    # self.tagger is the (separate) POS tagger; the POS column is a
    # feature of the CRF model.
    tags = self.tagger.tag(tokens)
    tagger.clear()
    for word, posTag in tags:
        tagger.add(str("%s %s" % (word, posTag)))
    tagger.parse()
    # BUGFIX: removed a dead loop that fetched each label into an unused
    # local before this comprehension recomputed them all.
    labels = [tagger.y2(i) for i in range(len(tokens))]
    return indexes, tokens, labels
def score(groundTruthSessions, testSessions):
    """Score test annotations against ground truth and print P/R/F1.

    For each instruction, every test annotation is matched against the
    ground-truth annotations via findMatch/npMatch: a match counts as a
    true positive, an unmatched test annotation as a false positive,
    and an unmatched ground-truth annotation as a false negative.
    Results accumulate in a ConfusionMatrix; verbose per-annotation
    diagnostics are printed along the way.
    """
    tagger = makeTagger()
    cm = ConfusionMatrix()
    for groundTruth in groundTruthSessions:
        # NOTE(review): testSessions appears to be keyed by the
        # ground-truth session object itself — confirm against caller.
        testSession = testSessions[groundTruth]
        for instructionIdx, instruction in enumerate(
                groundTruth.routeInstructions):
            groundTruthAnnotations = groundTruth.routeAnnotations[
                instructionIdx]
            indexes, tokens = tokenize(instruction)
            print "tokens", tokens
            tags = tagger.tag(tokens)
            print " ".join(["%s/%s" % (word, tag) for word, tag in tags])
            # Track which ground-truth annotations were claimed by some
            # test annotation; leftovers become false negatives.
            matchedIndexes = [False for g in groundTruthAnnotations]
            if len(groundTruthAnnotations) != 0:
                print "considering", groundTruth.key, "instruction", instructionIdx
            for testAnnotation in testSession.routeAnnotations[
                    instructionIdx]:
                idx, groundTruthMatch = findMatch(testAnnotation,
                                                  groundTruthAnnotations,
                                                  npMatch)
                if groundTruthMatch is None:
                    print "fp", testAnnotation
                    cm.FP += 1
                else:
                    print "tp", testAnnotation
                    print "\tmatched", groundTruthMatch
                    cm.TP += 1
                    matchedIndexes[idx] = True
            for i, hasMatch in enumerate(matchedIndexes):
                if not hasMatch:
                    cm.FN += 1
                    print "fn", groundTruthAnnotations[i]
                #else: # what to do with true negatives
    print "precision", cm.precision
    print "recall", cm.recall
    print "f1", cm.f1
def spellcheck(str):
    """Return str with each misspelled token replaced by the spelling
    checker's top suggestion, tokens re-joined with single spaces.

    Punctuation tokens (, . ? !) and tokens with no suggestions are
    passed through unchanged.

    NOTE(review): the parameter name shadows the builtin ``str``; it is
    kept to preserve the keyword-argument interface for callers.
    """
    from chunker import tokenize
    _, tokens = tokenize(str)
    corrected = []
    for token in tokens:
        if token not in [",", ".", "?", "!"] and not checker.check(token):
            # BUGFIX: previously checker.suggest() was called twice per
            # misspelled token (once to test, once to index [0]).
            suggestions = checker.suggest(token)
            if len(suggestions) != 0:
                corrected.append(suggestions[0])
            else:
                corrected.append(token)
        else:
            corrected.append(token)
    # join() replaces the original quadratic string += loop; output is
    # identical (a single space between every pair of tokens).
    return " ".join(corrected)
def main():
    """Histogram the verbs used in each command type of the verb corpus
    and plot a stacked bar chart per type plus a combined chart.

    Loads the spreadsheet corpus, prints summary statistics, then for
    every command type POS-tags each command and counts tokens whose
    tag starts with "V" (verbs).
    """
    corpus = Corpus("%s/data/verbs/corpus-11-2009.ods" % TKLIB_HOME)
    posTagger = chunker.makeTagger()

    # Corpus summary statistics.
    print len(corpus.sessions), "subjects", len(corpus.commands), "commands"
    # Trailing comma keeps the count and its label on one output line.
    print len([x.command for x in corpus.commands
               if commandFilter(x.command)]),
    print "filtered commands"
    print na.mean([len([x for x in s.commands if commandFilter(x.command)])
                   for s in corpus.sessions]), "commands per subject"

    # Plot layout parameters shared by the graphStacked calls below.
    figsize = (6, 10)
    bottomAdjust = 0.07
    topAdjust = 0.9

    histograms = {}
    commandTypes = corpus.commandTypes
    #commandTypes = ["Guiding people"]
    #commandTypes = ["Surveillance"]
    for commandType in commandTypes:
        histogram = Histogram()
        histograms[commandType] = histogram
        for command in corpus.commandsForType(commandType):
            indexes, tokens = chunker.tokenize(command.command)
            tokens = [t.lower() for t in tokens]
            tags = posTagger.tag(tokens)
            for token, tag in tags:
                # Penn Treebank verb tags all start with "V" (VB, VBD,
                # VBG, ...).
                if tag[0] == "V":
                    histogram.add(token)
        # One chart per command type.
        graphStacked({"All": histogram}, "histogram", commandType,
                     maxCols=50, xticks=None, figsize=figsize,
                     topAdjust=topAdjust, bottomAdjust=bottomAdjust)

    # Combined chart across all command types.
    graphStacked(histograms, "histogram", "All", maxCols=10,
                 figsize=figsize, topAdjust=topAdjust,
                 bottomAdjust=bottomAdjust)
    mpl.legend()
    mpl.show()
def testEmpty(self):
    """Tokenizing whitespace-only input yields no tokens and no indexes."""
    # Consistency fix: chunker.tokenize returns (indexes, tokens)
    # everywhere else in this codebase; the result was previously
    # unpacked in the opposite order (harmless here only because both
    # lists are asserted empty, but misleading to readers).
    indexes, tokens = chunker.tokenize(" ")
    self.assertEqual(len(tokens), 0)
    self.assertEqual(len(indexes), 0)