Example #1
 def __init__(self, model_fname):
     self.newcf = CostFnCrf.from_mallet(
         model_fname,
         guiMode=True,
         feature_extractor_cls=kitchen_features.GGGFeatures)
     self.task_planner = nodeSearch.BeamSearch(self.newcf)
     self.sentence_tokenizer = SentenceTokenizer()
Example #2
def chunkInstructions(fname):
    """Chunk every route instruction in the sessions stored in fname and
    save the resulting annotations back to each session."""
    chunker = Chunker()
    sentenceTokenizer = SentenceTokenizer()
    sessions = readSession(fname, "regexp_chunker")
    for session in sessions:
        session.clearAnnotations()
        for instructionIdx, instruction in enumerate(
                session.routeInstructions):
            for sentenceStandoff in sentenceTokenizer.tokenize(
                    session, instructionIdx):
                offset = sentenceStandoff.start
                print "instruction", instruction
                indexes, tokens, chunks = chunker.chunk(sentenceStandoff.text)
                print "# of chunks", len(chunks)
                # Attach a Standoff to each chunk leaf so it stays tied to
                # its location in the original instruction.
                for i, c in enumerate(chunks.leaves()):
                    word, tag = c
                    standoff = Standoff(
                        session, instructionIdx,
                        (indexes[i] + offset, indexes[i] + len(word) + offset))
                    chunks[chunks.leaf_treeposition(i)] = (word, tag, standoff)

                addChunks(chunks, session, instructionIdx)

        session.saveAnnotations()
Example #3
def pos_histograms(discourses, 
                   pos_tagger,
                   tag_groups={"Verbs":["VBZ", "VB"],
                               "Nouns":["NN", "NNS"],
                               "Prepositions":["IN", "TO", "UNTIL", "OF"],
                               "Adjectives":["JJ"],
                               }):

    """Collect the words in each part-of-speech tag group across the given
    discourses and print simple frequency counts for each group."""
    stokenizer = SentenceTokenizer()
    tokenizer = IndexedTokenizer()
    tag_groups_to_words = collections.defaultdict(lambda: list())

    for discourse in discourses:
        for sentence_standoff in stokenizer.tokenize(discourse):
            tokens = tokenizer.tokenize(sentence_standoff.text)
            # Tag each sentence once, then bucket every token by the
            # part-of-speech groups its tag falls into.
            tags = pos_tagger.tag([t.text.lower() for t in tokens])
            for token, tag in tags:
                tag_groups_to_words["all"].append(token)
                for key_tag, tag_group in tag_groups.iteritems():
                    if tag in tag_group:
                        tag_groups_to_words[key_tag].append(token)

    print "dumping counts"
    for pos, words in tag_groups_to_words.iteritems():
        print len(words), pos
        print len(set(words)), "unique", pos
        w_to_counts = collections.defaultdict(lambda : 0)
        for w in words:
            w_to_counts[w] += 1

        cnt_target = 10
        frequent_words = [(w, cnt) for w, cnt in w_to_counts.iteritems() 
                          if cnt > cnt_target]
        print len(frequent_words)
        print "if appeared more than %d times" % cnt_target
        print frequent_words

    print "done"
    return tag_groups_to_words

        
Example #4
def compareAnnotations(groundTruthSessions, testSessions, keys=Annotation.keys):
    """Return the fraction of test annotations whose best-overlapping ground
    truth annotation matches on the given keys."""
    sentenceTokenizer = SentenceTokenizer()

    numMatches = 0.0
    total = 0.0
    
    for groundTruthSession, testSession in zip(groundTruthSessions, 
                                               testSessions):
        for instructionIdx, instruction in enumerate(testSession.routeInstructions):
            for sentenceStandoff in sentenceTokenizer.tokenize(instruction):
                testAnnotations = annotationsInRange(testSession.routeAnnotations[instructionIdx],
                                                     sentenceStandoff)
                groundTruthAnnotations = annotationsInRange(groundTruthSession.routeAnnotations[instructionIdx],
                                                            sentenceStandoff)
                
                for testAnnotation in testAnnotations:
                    degreesOfOverlap = [testAnnotation.degreeOfOverlap(x)
                                        for x in groundTruthAnnotations]
                    matchIdx, matchValue = math2d.argMax(degreesOfOverlap)
                    if (matchIdx is not None and
                            matches(groundTruthAnnotations[matchIdx],
                                    testAnnotation, keys)):
                        numMatches += 1
                    total += 1
    return numMatches / total
Example #5
class IndexedTokenizer:
    def __init__(self, nltkTokenizer=PunktWordTokenizer()):
        self.nltkTokenizer = nltkTokenizer
        self.sentenceTokenizer = SentenceTokenizer()

    def tokenize(self, string):
        tokens = []
        for sentence in self.sentenceTokenizer.tokenize(string):
            word_tokens = self.word_tokenize(sentence.text)
            for w in word_tokens:
                correctStandoff(sentence, w)
            tokens.extend(word_tokens)
        return tokens

    def word_tokenize(self, string):
        """
        Tokenize a string, returning a tuple. The first element is a
        list of starting locations for the tokens and the second
        element is a list of tokens.
        """
        tokens = self.nltkTokenizer.tokenize(string)
        # Recover each token's starting offset in the original string.
        indexes = []
        startIdx = 0
        for token in tokens:
            idx = string.index(token, startIdx)
            indexes.append(idx)
            startIdx = idx + len(token)
        # Split trailing sentence punctuation off the final token.
        if len(tokens) > 0:
            lastToken = tokens[-1]
            if len(lastToken) > 1 and lastToken[-1] in ('?', '.', '!'):
                lastCharacter = lastToken[-1]
                tokens[-1] = lastToken[0:-1]
                tokens.append(lastCharacter)
                indexes.append(indexes[-1] + len(lastToken) - 1)

        return [
            TextStandoff(string, (i, i + len(token)))
            for i, token in zip(indexes, tokens)
        ]
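
A minimal usage sketch for the IndexedTokenizer above. The sample sentence is
made up, and the .text / .range attributes are assumptions based on how
TextStandoff is used elsewhere in these examples:

    tokenizer = IndexedTokenizer()
    standoffs = tokenizer.tokenize("Go down the hall. Turn left at the door.")
    for standoff in standoffs:
        # Each TextStandoff carries the token text and its (start, end) span.
        print standoff.text, standoff.range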
Example #6
 def __init__(self, modelFile):
     self.tagger = chunker.makeTagger()
     self.sentenceTokenizer = SentenceTokenizer()
     self.modelFile = modelFile
Example #7
class CrfChunker:
    def __init__(self, modelFile):
        self.tagger = chunker.makeTagger()
        self.sentenceTokenizer = SentenceTokenizer()
        self.modelFile = modelFile

    def writeTrainingSession(self, session, instructionIdx, out):
        instruction = session.routeInstructions[instructionIdx]
        annotations = session.routeAnnotations[instructionIdx]

        self.writeTrainingForText(instruction, annotations, out)

    def writeTrainingForText(self, text, annotations, out):
        for sentenceStandoff in self.sentenceTokenizer.tokenize(text):
            indexes, tokens = tokenize(sentenceStandoff.text)
            tags = self.tagger.tag(tokens)

            for startIndex, (word, posTag) in zip(indexes, tags):
                startIndex = startIndex + sentenceStandoff.start
                wordRange = startIndex, startIndex + len(word)
                annotation, key = containingAnnotations(annotations, wordRange)
                if not (annotation is None):
                    chunk = key
                else:
                    chunk = "None"
                out.write("\t".join([word, posTag, chunk]) + "\n")
            out.write("\n")

    def writeTraining(self, sessions, outFileName):
        out = open(outFileName, "w")
        for session in sessions:
            for instructionIdx in range(len(session.routeInstructions)):
                #if session.subject == "Subject 17" and instructionIdx == 2:
                self.writeTrainingSession(session, instructionIdx, out)
                out.write("\n")
        out.close()

    def runTraining(self, templateFile, trainingFile, outputFile):
        sh(TKLIB_HOME + "/nlp/3rdParty/crf++/CRF++-0.53/crf_learn %s %s %s" %
           (templateFile, trainingFile, outputFile))

    def runTesting(self, modelFile, testingFile, outputFile):
        sh(TKLIB_HOME +
           "/nlp/3rdParty/crf++/CRF++-0.53/crf_test -m %s %s > %s" %
           (modelFile, testingFile, outputFile))

    def confusionMatrix(self, outputFile):
        cmKeys = Annotation.keys + ["None"]

        cm = ConfusionMatrix(cmKeys)
        baselineCm = ConfusionMatrix(cmKeys)
        for line in open(outputFile, "r"):
            tokens = line.split()
            if len(tokens) == 0:
                continue
            else:
                token = tokens[0]
                features = tokens[1:-3]
                trueLabel = tokens[-2]
                systemLabel = tokens[-1]

                #token, pos, trueLabel, systemLabel = tokens
                cm.increment(trueLabel, systemLabel)
                baselineCm.increment(trueLabel, "landmark")
        return cm, baselineCm

    def chunk(self, string):
        import CRFPP
        tagger = CRFPP.Tagger("-m " + self.modelFile)
        indexes, tokens = tokenize(string)
        tags = self.tagger.tag(tokens)
        tagger.clear()

        # Feed each word and its POS tag to the CRF, then read back the
        # predicted chunk label for every token.
        for word, posTag in tags:
            tagger.add(str("%s %s" % (word, posTag)))
        tagger.parse()
        labels = [tagger.y2(i) for i in range(len(tokens))]
        return indexes, tokens, labels
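
A sketch of how the CrfChunker training pipeline above might be driven end to
end. The file names are placeholders, trainSessions and testSessions are
assumed to come from readSession as in Example #2, and only methods defined in
the class are called:

    chunker = CrfChunker("chunk.model")                # placeholder model path
    chunker.writeTraining(trainSessions, "train.crf")  # word \t POS \t chunk rows
    chunker.writeTraining(testSessions, "test.crf")
    chunker.runTraining("template.txt", "train.crf", "chunk.model")
    chunker.runTesting("chunk.model", "test.crf", "predictions.txt")
    cm, baselineCm = chunker.confusionMatrix("predictions.txt")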
Example #8
 def __init__(self, modelFile=TKLIB_HOME + "/nlp/data/out.model"):
     self.sentenceTokenizer = SentenceTokenizer()
     self.chunker = CrfChunker(modelFile)
Example #9
class SdcExtractor:
    def __init__(self, modelFile=TKLIB_HOME + "/nlp/data/out.model"):
        self.sentenceTokenizer = SentenceTokenizer()
        self.chunker = CrfChunker(modelFile)

    def chunk(self, instructionTxt):
        try:
            return self.doChunk(instructionTxt)
        except:
            print instructionTxt
            print instructionTxt.__class__
            raise

    def doChunk(self, instructionTxt):
        """
        The method takes a string and returns a list of Annotations,
        which I should rename to SpatialDescriptionClause.  Each
        Annotation contains a figure, a ground, a spatial relation,
        and a verb.  It uses standoff tags internally so you know
        exactly what part of the input string is part of each field.
        """

        annotations = []
        for sentenceStandoff in self.sentenceTokenizer.tokenize(
                instructionTxt):
            indexes, tokens, labels = self.chunker.chunk(sentenceStandoff.text)

            def nullMap():
                return dict([(key, TextStandoff(instructionTxt, (0, 0)))
                             for key in Annotation.keys])

            offset = sentenceStandoff.start
            annotation = nullMap()
            currentField = None
            currentStandoff = TextStandoff(instructionTxt, (0, 0))
            #print
            #print
            for index, token, label in zip(indexes, tokens, labels):
                if currentField != label:
                    if not currentStandoff.isNull():
                        annotation[currentField] = currentStandoff

                    if label == "None" or not annotation[label].isNull():
                        #print "adding annotation."
                        #for key in Annotation.keys:
                        #    print key, annotation[key].text

                        if not hasAllNullValues(annotation):
                            annotations.append(Annotation(**annotation))
                            annotation = nullMap()

                    currentStandoff = TextStandoff(
                        instructionTxt,
                        (index + offset, index + len(token) + offset))
                    currentField = label
                else:
                    currentStandoff.range = (currentStandoff.start,
                                             index + len(token) + offset)

            if not currentStandoff.isNull():
                annotation[currentField] = currentStandoff
            # only add if there are non-null fields.
            if not hasAllNullValues(annotation):
                annotations.append(Annotation(**annotation))
        return annotations
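
A minimal usage sketch for SdcExtractor, following the doChunk docstring. The
instruction text is made up, and exposing each field as an attribute of the
returned Annotation is an assumption based on the Annotation(**annotation)
call above:

    extractor = SdcExtractor()   # uses the default out.model path
    annotations = extractor.chunk("Go down the hallway past the elevators.")
    for annotation in annotations:
        for key in Annotation.keys:
            # Each field is a TextStandoff into the original instruction.
            print key, getattr(annotation, key).text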
Example #10
class RecipeManager:
    """
    This is the high-level class that manages calls to recipe
    inference.
    """
    def __init__(self, model_fname):
        self.newcf = CostFnCrf.from_mallet(
            model_fname,
            guiMode=True,
            feature_extractor_cls=kitchen_features.GGGFeatures)
        self.task_planner = nodeSearch.BeamSearch(self.newcf)
        self.sentence_tokenizer = SentenceTokenizer()

    def make_ggg_for_instruction(self, text):
        esdc = ExtendedSdc("EVENT", text, r=TextStandoff(text, (0, len(text))))
        ggg = ggg_from_esdc(esdc)
        return esdc, ggg

    def find_plan(self, recipe_text, initial_state):
        """
        Returns a sequence of states and actions, given the text of
        the recipe (not including the ingredients) and the initial
        KitchenState.

        Returns a list of (instruction, state_seq), where each
        instruction is a TextStandoff into the original recipe.

        The state_seq is a list of (action, state).  The action is the
        action that was executed to create the state.  The sequence
        starts with initial_state, but this state is not returned in
        the list.

        To iterate through the results, do something like this: 
        
        for instruction, state_seq in whole_sequence:
          for action, state in state_seq:
             print "action", action

        """

        instructions = self.sentence_tokenizer.tokenize(recipe_text)
        current_state = initial_state
        whole_sequence = []
        for instruction in instructions:
            #print "inst",  type(instruction), instruction
            if len(instruction.text) < 3:
                continue
            try:
                esdc, ggg = self.make_ggg_for_instruction(instruction.text)
                results = self.task_planner.find_plan(current_state, [ggg],
                                                      save_state_tree=True,
                                                      allow_null_action=False,
                                                      search_depth_event=7,
                                                      beam_width_event=10,
                                                      beam_width=10)
                if len(results) != 0:
                    state_sequence = self.sequence(results[0])
                    (cost, idx), state, ggg = results[0]
                    #print "sequence: ", results[0]
                    #probability = math.exp(-cost)
                    #print probability

                    whole_sequence.append((instruction, state_sequence, cost))
                    current_state = state_sequence[-1][1]
            except:
                print "recipe", recipe_text
                print "exception on", instruction
                raise
        return whole_sequence

    def find_viterbi_plan(self, recipe_text, initial_state):
        preprocessed_instructions = self.sentence_tokenizer.tokenize(
            recipe_text)
        # Skip degenerate sentences (e.g. stray punctuation).
        instructions = [i for i in preprocessed_instructions
                        if len(i.text) > 3]

        viterbi_results = self.recurse_viterbi(initial_state, [], instructions,
                                               0)
        print "Number of possible paths found:", len(viterbi_results)
        # Pick the lowest-cost complete path; the cost is the last element
        # of each (path, cost) tuple returned by recurse_viterbi.
        lowest = viterbi_results[0][-1]
        index = 0
        for i in range(len(viterbi_results)):
            if viterbi_results[i][-1] < lowest:
                lowest = viterbi_results[i][-1]
                index = i
            elif viterbi_results[i][-1] == lowest:
                print "tie: " + str(i)
        print index, lowest
        print viterbi_results[index]
        return viterbi_results[index]

    def recurse_viterbi(self,
                        initial_state,
                        current_path,
                        instructions,
                        depth=0):
        # Expand the instruction at this depth from the current state, then
        # either recurse on each resulting state or, at the last
        # instruction, return the completed (path, total cost) candidates.
        current_state = initial_state
        instruction = instructions[depth]
        esdc, ggg = self.make_ggg_for_instruction(instruction.text)
        results = self.task_planner.find_plan(current_state, [ggg],
                                              save_state_tree=True,
                                              allow_null_action=False,
                                              search_depth_event=10,
                                              beam_width_event=5,
                                              beam_width=5)
        if depth < (len(instructions) - 1):
            viterbi_results = []
            for result in results:
                (cost, idx), state, ggg = result
                viterbi_results.extend(
                    self.recurse_viterbi(state, current_path + [result],
                                         instructions, (depth + 1)))
            return viterbi_results
        else:
            possible_paths = []
            for result in results:
                (cost, idx), state, ggg = result
                # The total cost of a complete path is this step's cost plus
                # the costs accumulated along current_path.
                totalProb = cost
                for step in current_path:
                    totalProb += step[0][0]
                possible_paths.append((current_path + [result], totalProb))
            return possible_paths

    def find_dijkstra_plan(self, recipe_text, initial_state):
        preprocessed_instructions = self.sentence_tokenizer.tokenize(
            recipe_text)
        instructions = [i for i in preprocessed_instructions
                        if len(i.text) > 3]
        depth = len(instructions)
        paths = []
        paths.append(KitchenPath(initCost=0.0, initPath=[initial_state]))
        print paths
        while (len(paths) > 0):
            print "looking for new path", len(paths)
            likeliestPath = min(paths, key=lambda x: x.cost)
            paths.remove(likeliestPath)
            print "found my short path of cost", likeliestPath.cost, "at depth", likeliestPath.depth
            currentState, currentDepth = likeliestPath.currentNode()
            if (currentDepth == depth):
                print "Possible paths expanded: ", len(paths)
                return (likeliestPath.wholePath(), likeliestPath.cost)
            currentInstruction = instructions[currentDepth]
            esdc, ggg = self.make_ggg_for_instruction(currentInstruction.text)
            results = self.task_planner.find_plan(currentState, [ggg],
                                                  save_state_tree=True,
                                                  allow_null_action=False,
                                                  search_depth_event=10,
                                                  beam_width_event=5,
                                                  beam_width=5)
            if len(results) != 0:
                for i in results:
                    (cost, idx), state, ggg = i
                    print likeliestPath
                    paths.append(likeliestPath.addNode(i))

        print "No paths found :("

    def find_beam_plan(self, recipe_text, initial_state):
        beam_width = 2
        preprocessed_instructions = self.sentence_tokenizer.tokenize(
            recipe_text)
        instructions = [i for i in preprocessed_instructions
                        if len(i.text) > 3]
        paths = []
        paths.append(KitchenPath(initCost=0.0, initPath=[initial_state]))

        for i in range(len(instructions)):
            print "Instruction level:", i
            tempPaths = []
            currentInstruction = instructions[i]
            for j in range(len(paths)):
                print "Path #:", j
                currentState, currentDepth = paths[j].currentNode()
                esdc, ggg = self.make_ggg_for_instruction(
                    currentInstruction.text)
                results = self.task_planner.find_plan(currentState, [ggg],
                                                      save_state_tree=True,
                                                      allow_null_action=False,
                                                      search_depth_event=7,
                                                      beam_width_event=10,
                                                      beam_width=10)
                if len(results) != 0:
                    k = 0
                    while (k < min(beam_width, len(results))):
                        state_sequence = self.sequence(results[k])
                        (cost, idx), state, ggg = results[k]
                        tempPaths.append(paths[j].addNode(
                            results[k],
                            (currentInstruction, state_sequence, cost)))
                        k += 1
                else:
                    tempPaths.append(paths[j].copy())

            paths = tempPaths
        likeliestPath = min(paths, key=lambda x: x.cost)
        return likeliestPath.getWholeSequence()

    def sequence(self, plan):
        cost, state, ggg = plan
        return self.task_planner.state_sequence(state)
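
A sketch of how RecipeManager.find_plan might be driven, following its
docstring. The model file, recipe text, and the make_initial_kitchen_state
helper are placeholders, not part of the example above:

    manager = RecipeManager("kitchen.model")      # placeholder model file
    initial_state = make_initial_kitchen_state()  # hypothetical helper
    recipe = "Preheat the oven. Mix the flour and the sugar."
    for instruction, state_seq, cost in manager.find_plan(recipe,
                                                          initial_state):
        print "instruction:", instruction.text, "cost:", cost
        for action, state in state_seq:
            print "  action", action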
Example #11
 def __init__(self, nltkTokenizer=PunktWordTokenizer()):
     self.nltkTokenizer = nltkTokenizer
     self.sentenceTokenizer = SentenceTokenizer()