def sdcToClassifier(self, sdc):
    """Find the spatial-relation classifier engine for an SDC.

    Tokenizes the SDC's verb and spatial-relation text, then picks an
    engine from self.tokenToEngine by keyword; returns None when no
    engine applies.
    """
    _, verbTokens = chunker.tokenize(sdc.verb.text)
    _, relationTokens = chunker.tokenize(sdc.spatialRelation.text)
    verbTokens = [t.lower() for t in verbTokens]
    relationTokens = [t.lower() for t in relationTokens]

    # The verb "pass" is handled by the engine trained for "past".
    if "pass" in verbTokens and "past" in self.tokenToEngine:
        return self.tokenToEngine["past"]

    for keyword, engine in self.tokenToEngine.iteritems():
        if keyword not in relationTokens:
            continue
        # Skip bare "to" when it is embedded in a longer relation such
        # as "with your back to" or "to the left of".
        embeddedTo = (keyword == "to"
                      and sdc.spatialRelation.text.lower() != "to"
                      and len(sdc.spatialRelation.text.split()) > 2)
        if not embeddedTo:
            return engine

    # "stop at X" maps to the "until" engine when a landmark is present.
    if "stop" in verbTokens:
        if not sdc.landmark.isNull() and "until" in self.tokenToEngine:
            return self.tokenToEngine["until"]

    # if "exit" in vTokens or "leave" in vTokens:
    #     if not sdc.landmark.isNull():
    #         return self.tokenToEngine["out"]
    return None
def main():
    """Build per-field token histograms over all SDCs in an annotated
    dialog corpus and plot one stacked bar chart per field.

    Reads the annotated dialogs, counts token frequencies for each SDC
    field (landmark / verb / figure / spatialRelation), then graphs each
    histogram with graphStacked and shows the figure.
    """
    dialog_fname = "sdc_annotations/stefie10.dialog.xml"
    dialogs = readDialogs(dialog_fname)

    # Plot layout parameters, shared by every graphStacked call below.
    figsize = (6, 4.5)
    bottomAdjust = 0.14
    topAdjust = 0.9

    # One histogram per SDC annotation field.
    histograms = {"landmark": Histogram(),
                  "verb": Histogram(),
                  "figure": Histogram(),
                  "spatialRelation": Histogram()}

    for dialog in dialogs:
        for turn in dialog.turns:
            for sdc in turn.sdcs:
                indexes, allTokens = chunker.tokenize(sdc.text)
                movementWords = set(["go", "going", "walk", "walking"])
                lowerTokens = [t.lower() for t in allTokens]
                # Debug output: echo SDCs that contain a movement verb.
                if len(movementWords.intersection(lowerTokens)) != 0:
                    print sdc.text
                for key in sdc.keys:
                    # Keys may carry a trailing character (e.g. a
                    # plural/index suffix); fall back to the key with
                    # its last character stripped before giving up.
                    if key in histograms:
                        histogram = histograms[key]
                    elif key[:-1] in histograms:
                        histogram = histograms[key[:-1]]
                    else:
                        raise ValueError("No histogram for: " + ` key `)
                    text = sdc.annotationMap[key].text
                    indexes, tokens = chunker.tokenize(text)
                    for t in tokens:
                        histogram.add(t)

    # One stacked bar chart per field.
    for key, histogram in histograms.iteritems():
        print "doing", key, histogram
        title = "%s" % key
        graphStacked({"all": histogram}, "histogram", title, maxCols=10,
                     xticks=None, figsize=figsize, topAdjust=topAdjust,
                     bottomAdjust=bottomAdjust)
    mpl.legend()
    mpl.show()
def gloss_map(self):
    """Return a FreqDist of tokens drawn from the synset definition
    (gloss) of each word in self.words."""
    distribution = nltk.FreqDist()
    for position, _ in enumerate(self.words):
        definition = self.synset(position).definition
        _, glossTokens = chunker.tokenize(definition)
        for token in glossTokens:
            distribution.inc(token)
    return distribution
def writeTrainingForText(self, text, annotations, out):
    """Write CRF training rows for text to the stream out.

    Each token becomes a tab-separated line of (word, POS tag, chunk
    label); the chunk label is the annotation key covering the token,
    or the literal string "None". A blank line separates sentences.
    """
    for sentence in self.sentenceTokenizer.tokenize(text):
        offsets, words = tokenize(sentence.text)
        tagged = self.tagger.tag(words)
        for offset, (word, posTag) in zip(offsets, tagged):
            # Convert the sentence-relative offset to a document-level
            # character range for annotation lookup.
            absStart = offset + sentence.start
            span = absStart, absStart + len(word)
            annotation, key = containingAnnotations(annotations, span)
            chunk = key if annotation is not None else "None"
            out.write("\t".join([word, posTag, chunk]) + "\n")
        out.write("\n")
def chunk(self, string):
    """Run the CRF chunker over string.

    Returns (indexes, tokens, labels): character offsets of the tokens,
    the tokens themselves, and the CRF-predicted label for each token.
    """
    import CRFPP
    tagger = CRFPP.Tagger("-m " + self.modelFile)
    indexes, tokens = tokenize(string)
    # self.tagger is the (separate) POS tagger; the POS column is a
    # feature of the CRF model.
    tags = self.tagger.tag(tokens)
    tagger.clear()
    for word, posTag in tags:
        tagger.add(str("%s %s" % (word, posTag)))
    tagger.parse()
    # BUGFIX: removed a dead loop that fetched each label into an unused
    # local before this comprehension recomputed them all.
    labels = [tagger.y2(i) for i in range(len(tokens))]
    return indexes, tokens, labels
def score(groundTruthSessions, testSessions):
    """Score test annotations against ground truth and print P/R/F1.

    For each instruction, every test annotation is matched against the
    ground-truth annotations via findMatch/npMatch: a match counts as a
    true positive, an unmatched test annotation as a false positive,
    and an unmatched ground-truth annotation as a false negative.
    Results accumulate in a ConfusionMatrix; verbose per-annotation
    diagnostics are printed along the way.
    """
    tagger = makeTagger()
    cm = ConfusionMatrix()
    for groundTruth in groundTruthSessions:
        # NOTE(review): testSessions appears to be keyed by the
        # ground-truth session object itself — confirm against caller.
        testSession = testSessions[groundTruth]
        for instructionIdx, instruction in enumerate(
                groundTruth.routeInstructions):
            groundTruthAnnotations = groundTruth.routeAnnotations[
                instructionIdx]
            indexes, tokens = tokenize(instruction)
            print "tokens", tokens
            tags = tagger.tag(tokens)
            print " ".join(["%s/%s" % (word, tag) for word, tag in tags])
            # Track which ground-truth annotations were claimed by some
            # test annotation; leftovers become false negatives.
            matchedIndexes = [False for g in groundTruthAnnotations]
            if len(groundTruthAnnotations) != 0:
                print "considering", groundTruth.key, "instruction", instructionIdx
            for testAnnotation in testSession.routeAnnotations[
                    instructionIdx]:
                idx, groundTruthMatch = findMatch(testAnnotation,
                                                  groundTruthAnnotations,
                                                  npMatch)
                if groundTruthMatch is None:
                    print "fp", testAnnotation
                    cm.FP += 1
                else:
                    print "tp", testAnnotation
                    print "\tmatched", groundTruthMatch
                    cm.TP += 1
                    matchedIndexes[idx] = True
            for i, hasMatch in enumerate(matchedIndexes):
                if not hasMatch:
                    cm.FN += 1
                    print "fn", groundTruthAnnotations[i]
                #else: # what to do with true negatives
    print "precision", cm.precision
    print "recall", cm.recall
    print "f1", cm.f1
def spellcheck(str):
    """Return str with each misspelled token replaced by the spelling
    checker's top suggestion, tokens re-joined with single spaces.

    Punctuation tokens (, . ? !) and tokens with no suggestions are
    passed through unchanged.

    NOTE(review): the parameter name shadows the builtin ``str``; it is
    kept to preserve the keyword-argument interface for callers.
    """
    from chunker import tokenize
    _, tokens = tokenize(str)
    corrected = []
    for token in tokens:
        if token not in [",", ".", "?", "!"] and not checker.check(token):
            # BUGFIX: previously checker.suggest() was called twice per
            # misspelled token (once to test, once to index [0]).
            suggestions = checker.suggest(token)
            if len(suggestions) != 0:
                corrected.append(suggestions[0])
            else:
                corrected.append(token)
        else:
            corrected.append(token)
    # join() replaces the original quadratic string += loop; output is
    # identical (a single space between every pair of tokens).
    return " ".join(corrected)
def main():
    """Histogram the verbs used in each command type of the verb corpus
    and plot a stacked bar chart per type plus a combined chart.

    Loads the spreadsheet corpus, prints summary statistics, then for
    every command type POS-tags each command and counts tokens whose
    tag starts with "V" (verbs).
    """
    corpus = Corpus("%s/data/verbs/corpus-11-2009.ods" % TKLIB_HOME)
    posTagger = chunker.makeTagger()

    # Corpus summary statistics.
    print len(corpus.sessions), "subjects", len(corpus.commands), "commands"
    # Trailing comma keeps the count and its label on one output line.
    print len([x.command for x in corpus.commands
               if commandFilter(x.command)]),
    print "filtered commands"
    print na.mean([len([x for x in s.commands if commandFilter(x.command)])
                   for s in corpus.sessions]), "commands per subject"

    # Plot layout parameters shared by the graphStacked calls below.
    figsize = (6, 10)
    bottomAdjust = 0.07
    topAdjust = 0.9

    histograms = {}
    commandTypes = corpus.commandTypes
    #commandTypes = ["Guiding people"]
    #commandTypes = ["Surveillance"]
    for commandType in commandTypes:
        histogram = Histogram()
        histograms[commandType] = histogram
        for command in corpus.commandsForType(commandType):
            indexes, tokens = chunker.tokenize(command.command)
            tokens = [t.lower() for t in tokens]
            tags = posTagger.tag(tokens)
            for token, tag in tags:
                # Penn Treebank verb tags all start with "V" (VB, VBD,
                # VBG, ...).
                if tag[0] == "V":
                    histogram.add(token)
        # One chart per command type.
        graphStacked({"All": histogram}, "histogram", commandType,
                     maxCols=50, xticks=None, figsize=figsize,
                     topAdjust=topAdjust, bottomAdjust=bottomAdjust)

    # Combined chart across all command types.
    graphStacked(histograms, "histogram", "All", maxCols=10,
                 figsize=figsize, topAdjust=topAdjust,
                 bottomAdjust=bottomAdjust)
    mpl.legend()
    mpl.show()
def testEmpty(self):
    """Tokenizing whitespace-only input yields no tokens and no indexes."""
    # Consistency fix: chunker.tokenize returns (indexes, tokens)
    # everywhere else in this codebase; the result was previously
    # unpacked in the opposite order (harmless here only because both
    # lists are asserted empty, but misleading to readers).
    indexes, tokens = chunker.tokenize(" ")
    self.assertEqual(len(tokens), 0)
    self.assertEqual(len(indexes), 0)