def __init__(self, training_ds, cf):
    """Store the training data and index the texts and groundings it contains.

    Args:
        training_ds: dataset whose ``observations`` are scanned; each
            observation's ``sdcs`` and ``annotation.groundings`` are read.
        cf: kept unchanged on ``self.cf``.
    """
    self.training_ds = training_ds
    self.cf = cf

    # Template structures are built over an empty text with zero-width spans.
    empty_text = ""
    self.esdc_structures = [
        ExtendedSdc(
            esdcType="PATH",
            entireText=empty_text,
            r=[TextStandoff(empty_text, (0, 0))],
            l=[
                ExtendedSdc(esdcType="OBJECT",
                            f=[TextStandoff(empty_text, (0, 0))]),
            ],
        ),
        #ExtendedSdc(esdcType="EVENT", entireText=t,
        #            r=[TextStandoff(t, (0, 0))],
        #            l=[TextStandoff(t, (0, 0))],
        #            l2=[TextStandoff(t, (0, 0))])
    ]

    # Map every ESDC field name to the non-empty child texts seen for it.
    self.esdc_field_to_texts = {}
    collected_groundings = set()
    for observation in self.training_ds.observations:
        for sdc in observation.sdcs:
            for field_name in ExtendedSdc.fieldNames:
                texts = self.esdc_field_to_texts.setdefault(field_name, [])
                child_text = sdc.childText(field_name)
                if child_text != "":
                    texts.append(child_text)

        # Only keep harvesting groundings while fewer than 10 are collected.
        if len(collected_groundings) < 10:
            for grounding_list in observation.annotation.groundings:
                collected_groundings.update(grounding_list)

    self.groundings = list(collected_groundings)
def testBreadthFirstTraverseLoop(self): cmd = "Move in front of the pallet of boxes in the center and wait." esdc = ExtendedSdc( 'EVENT', r=[TextStandoff(cmd, (0, 4)), TextStandoff(cmd, (5, 7))], l2=[], f=[TextStandoff(cmd, (8, 13))]) esdc.l = [esdc] def callback(esdcFromParent): print "callback" dataStructures.breadthFirstTraverse(esdc, callback)
def word_tokenize(self, string):
    """
    Tokenize a string into word-level standoffs.

    The tokens come from ``self.nltkTokenizer.tokenize``; each one is located
    in ``string`` by searching forward from the end of the previous token.
    If the final token is longer than one character and ends in '?', '.' or
    '!', that last character is split off into a token of its own.

    Returns a list of TextStandoff objects over ``string``, one per token.
    Raises ValueError (from ``str.index``) when a token cannot be found in
    ``string`` at or after the previous token's end.
    """
    tokens = self.nltkTokenizer.tokenize(string)

    starts = []
    cursor = 0
    for token in tokens:
        position = string.index(token, cursor)
        starts.append(position)
        cursor = position + len(token)

    if len(tokens) > 0:
        final = tokens[-1]
        if len(final) > 1 and final[-1] in ('?', '.', '!'):
            # The punctuation mark sits on the last character of the token.
            punctuation_start = starts[-1] + len(final) - 1
            tokens[-1] = final[0:-1]
            tokens.append(final[-1])
            starts.append(punctuation_start)

    standoffs = []
    for start, token in zip(starts, tokens):
        standoffs.append(TextStandoff(string, (start, start + len(token))))
    return standoffs
def recipesToDataset(recipes): observations = [] task_planner = nodeSearch.BeamSearch(CostFnRandom()) fe = kitchen_features.GGGFeatures() for recipe in recipes: print "training", recipe.name for idx in range(recipe.num_instructions): instruction = recipe.idx_to_instruction(idx) esdc = ExtendedSdc("EVENT", instruction, r=TextStandoff(instruction, (0, len(instruction)))) ggg = ggg_from_esdc(esdc) states = recipe.idx_to_states(idx) example_id = "%s_%d" % (recipe.name, idx) features = fe.features(ggg, ggg.factors[0], states) obs = dataset.ContinuousObservation(example_id, True, True, features, sdcs=[esdc]) observations.append(obs) negative_obs = negativeExamples(task_planner, fe, recipe.idx_to_start_state(idx), esdc, ggg, example_id, states) observations.extend(negative_obs) return dataset.ContinuousDataset(observations)
def standoffForString(relation, sentence): try: if relation.startswith("prep"): preposition = relation[relation.index("_") + 1:].replace("_", " ") index = sentence.lower().find(preposition + " ") if index == -1: index = sentence.lower().find(preposition) string = sentence[index:index + len(preposition)] standoffs = [] offset = index for token in string.split(" "): standoffs.append( TextStandoff(sentence, (offset, offset + len(token)))) offset += len(token) + 1 return (standoffs, preposition) else: raise ValueError("Invalid relation: " + ` relation ` + " in " + ` sentence `) except: print "relation", relation print "sentence", sentence raise
def updateRep(self):
    """Recompute the attributes derived from this object's fields.

    Establishes ``entireText`` from the field values (asserting that they all
    agree), sorts each field's values by range start, then refreshes
    ``range``/``startIdx``/``endIdx``/``text``, ``flattenedEsdcs``, the
    cached repr and hashes, and ``standoff``.  ``checkRep`` is called after
    the range attributes are updated.
    """
    if not hasattr(self, "frozen"):
        self.frozen = False

    # Pass 1: agree on the underlying text and order every field by position.
    for key, values in self:
        for value in values:
            if self.entireText is None:
                self.entireText = value.entireText
            else:
                assert self.entireText == value.entireText, (
                    self.entireText, value.entireText, key, str(value),
                    [str(x) for x in values])
        self.fields[key] = sorted(values, key=lambda item: item.range[0])

    # Pass 2: find the smallest range that encloses every value.  The seeds
    # reproduce the defaults used when there are no values at all.
    range_starts = [len(self.entireText)]
    range_ends = [0]
    for key, values in self:
        for value in values:
            value_start, value_end = value.range
            range_starts.append(value_start)
            range_ends.append(value_end)
    minRange = min(range_starts)
    maxRange = max(range_ends)

    self.range = minRange, maxRange
    self.startIdx = minRange
    self.endIdx = maxRange
    self.text = self.entireText[self.startIdx:self.endIdx]

    self.flattenedEsdcs = flattenedEsdcs([self])
    self.checkRep()

    # The hashes are derived from the freshly computed repr string.
    self._repr = self.recomputeRepr()
    self._hash = hash(self._repr)
    self.hash_string = fasthash(self._repr)
    self.standoff = TextStandoff(self.entireText, self.range)
def loadSdc(xmlElement, text):
    """Build an Annotation from the element children of an XML node.

    Each child element contributes one keyword argument named after the
    element; its value is a TextStandoff over ``text`` spanning the integer
    ``start`` and ``end`` attributes.  Non-element child nodes are ignored.
    """
    argMap = {}
    for child in xmlElement.childNodes:
        if not isinstance(child, xml.dom.minidom.Element):
            continue
        span = (int(child.getAttribute("start")),
                int(child.getAttribute("end")))
        argMap[str(child.nodeName)] = TextStandoff(text, span)
    return Annotation(**argMap)
def __init__(self, **args):
    """Create an annotation from keyword arguments of standoffs.

    ``entireText`` is taken from the first non-None value; every other
    non-None value is asserted to share it.  Any key of ``Annotation.keys``
    that is missing or None is filled with a zero-width standoff at (0, 0).
    ``self.range`` is a standoff over the range returned by
    ``enclosingRange`` for all values.
    """
    self.annotationMap = args
    self.entireText = None
    self.instructionIdx = None

    for value in self.annotationMap.itervalues():
        if value is None:
            continue
        if self.entireText is None:
            self.entireText = value.entireText
        else:
            assert self.entireText == value.entireText, (
                self.entireText, value.entireText)

    for key in Annotation.keys:
        if self.annotationMap.get(key) is None:
            self.annotationMap[key] = TextStandoff(self.entireText, (0, 0))

    enclosing = enclosingRange(self.annotationMap.values())
    self.range = TextStandoff(self.entireText, enclosing)
def testCorrectStandoffs(self):
    """correctStandoffs should move an ESDC onto the longer combined text."""
    annotations = yamlReader.load("data/forklift_open_ended.yaml")
    first_esdc = annotations[0].esdcs[0]
    second_esdc = annotations[1].esdcs[0]

    combined_text = first_esdc.entireText + " " + second_esdc.entireText
    # The first sentence keeps its original range inside the combined text.
    first_sentence = TextStandoff(combined_text, first_esdc.range)

    correctStandoffs(first_sentence, first_esdc)

    self.assertEqual(first_esdc.entireText, combined_text)
def setUp(self):
    """Create the ESDC fixtures shared by the tests."""

    def span(text, start, end):
        # Shorthand for a standoff over ``text``.
        return TextStandoff(text, (start, end))

    self.sentence = "Pick up the tire pallet."
    # esdc1 and esdc2 are built from identical spans; esdc3 differs in the
    # end of its landmark.
    self.esdc1 = ExtendedSdc("EVENT",
                             r=span(self.sentence, 0, 7),
                             l=span(self.sentence, 8, 23))
    self.esdc2 = ExtendedSdc("EVENT",
                             r=span(self.sentence, 0, 7),
                             l=span(self.sentence, 8, 23))
    self.esdc3 = ExtendedSdc("EVENT",
                             r=span(self.sentence, 0, 7),
                             l=span(self.sentence, 8, 22))

    self.sentence2 = "Pick up the tire pallet near the truck."
    self.childEsdc = ExtendedSdc("OBJECT",
                                 f=span(self.sentence2, 8, 23),
                                 r=span(self.sentence2, 24, 28),
                                 l=span(self.sentence2, 29, 38))
    self.parentEsdc = ExtendedSdc("EVENT",
                                  r=span(self.sentence2, 0, 7),
                                  l=self.childEsdc)
def tokenize(self, string):
    """Split ``string`` into sentence standoffs.

    Sentences come from ``self.tokenizer.tokenize``; each is located in
    ``string`` by searching forward from the end of the previous sentence.

    Returns a list of TextStandoff objects over ``string``.  Asserts that no
    two distinct standoffs overlap.
    """
    sentences = self.tokenizer.tokenize(string)

    standoffs = []
    search_from = 0
    for sentence in sentences:
        start = string.index(sentence, search_from)
        end = start + len(sentence)
        standoffs.append(TextStandoff(string, (start, end)))
        search_from = end

    # Sanity check over every ordered pair of results.
    for first in standoffs:
        for second in standoffs:
            assert first == second or not first.overlaps(second)

    return standoffs
def testNestedRepeatedStrings(self): from esdcs.dataStructures import ExtendedSdc, ExtendedSdcGroup from standoff import TextStandoff txt = "Move to the right side of the trailer of the trailer on the right and wait." esdcs = [ExtendedSdc('EVENT', r=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (0, 4))],l2=[],l=[ExtendedSdc('PATH', r=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (5, 7))],l2=[],l=[ExtendedSdc('OBJECT', r=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (23, 25))],l2=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (41, 44)), TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (45, 52))])],l=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (26, 29)), TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (30, 37))])],f=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (8, 11)), TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (12, 17)), TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (18, 22))])])],f=[])],f=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[], entireText='Move to the right side of the trailer of the trailer on the right and wait.')]), ExtendedSdc('OBJECT', r=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (53, 55))],l2=[],l=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (56, 59)), TextStandoff("Move to the right side of the trailer of the trailer on the right and 
wait.", (60, 65))])],f=[ExtendedSdc('OBJECT', r=[],l2=[],l=[],f=[TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (41, 44)), TextStandoff("Move to the right side of the trailer of the trailer on the right and wait.", (45, 52))])])] entireText, yamlData = esdcIo.toYaml(ExtendedSdcGroup(esdcs)) rereadAnnotations = esdcIo.fromYaml(entireText, yamlData) try: self.assertEqual(list(rereadAnnotations), esdcs) except: print "start with", [e.asPrettyMap() for e in esdcs] print "ended with", [e.asPrettyMap() for e in rereadAnnotations] raise
def testCorrectStandoffsImmutable(self): annotations = yamlReader.load("data/forklift_open_ended.yaml") esdc1 = annotations[0].esdcs[0] esdc2 = annotations[1].esdcs[0] old_entire_text = esdc1.entireText new_entire_text = esdc1.entireText + " " + esdc2.entireText sentence_standoff = TextStandoff(new_entire_text, esdc1.range) correctedEsdc1 = correctStandoffsImmutable(sentence_standoff, ExtendedSdcGroup([esdc1])) self.assertEqual(esdc1.entireText, old_entire_text) self.assertEqual(correctedEsdc1.entireText, new_entire_text) print str(correctedEsdc1[0]) self.assertEqual(" ".join(x.text for x in correctedEsdc1[0]["f"]), "Forklift")
def standoffFromToken(txt, token):
    """Return a TextStandoff over ``txt`` spanning the token's begin/end positions."""
    start = token.beginPosition()
    end = token.endPosition()
    return TextStandoff(txt, (start, end))
def _standoffsForUniqueString(argValue, entireText):
    """Return one TextStandoff per word of a string occurring exactly once in the text."""
    matches = list(re.finditer(re.escape(argValue), entireText))
    if len(matches) != 1:
        # Zero or several occurrences: list every candidate span so the
        # annotation can be rewritten with explicit indices.
        candidates = [[m.start(), m.end()] for m in matches]
        tokenizer = IndexedTokenizer()
        for candidate in candidates:
            print("candidate %s" % (candidate,))
            for standoff in tokenizer.tokenize(argValue):
                print("- - %s" % (standoff.text,))
                start = candidate[0] + standoff.start
                # The end index is based on this word's own length, not on
                # the length of the whole string.
                print(" - [%d, %d]" % (start, start + len(standoff.text)))
        raise ValueError("Must provide indices for token: '" + argValue +
                         "' in text '" + entireText + "'. matches: " +
                         repr(candidates))

    match = matches[0]
    matchText = match.group()
    tokens = []
    currentIndex = 0
    for token in matchText.split():
        # Search from the end of the previous word so repeated words map to
        # successive positions.
        tokenIdx = matchText.index(token, currentIndex)
        start = match.start() + tokenIdx
        tokens.append(TextStandoff(entireText, (start, start + len(token))))
        currentIndex = tokenIdx + len(token)
    return tokens


def _standoffsForWordList(argValue, entireText):
    """Return TextStandoffs for (token, (start, end)) pairs after checking them against the text."""
    tokens = []
    try:
        for token, (start, end) in argValue:
            substring = entireText[start:end]
            if substring != token:
                print("candidates")
                # The token is literal text, not a pattern: escape it so
                # characters such as "." or "(" cannot distort or break the
                # search.
                for match in re.finditer(re.escape(token), entireText):
                    print([match.start(), match.end()])
                raise ValueError("Token '" + token + "' must correspond" +
                                 " to index " + repr((start, end)) +
                                 "and not '" + substring + "'.")
            tokens.append(TextStandoff(entireText, (start, end)))
    except:
        # Name the offending entry, then let the original error propagate.
        print("Problem with %s" % (argValue,))
        raise
    return tokens


def handleEsdcContents(argMap, entireText):
    """Convert the raw fields of one ESDC mapping into constructor arguments.

    Every key of ``argMap`` must be in ``ExtendedSdc.fieldNames``.  Values
    are converted as follows:

    * the ``"id"`` key is stored unchanged under ``"esdc_id"``;
    * a ``str`` must occur exactly once in ``entireText``; it is split on
      whitespace and each word becomes a TextStandoff;
    * a value accepted by ``isEsdc`` is parsed recursively with ``fromYaml``;
    * a value accepted by ``isWordList`` is a sequence of
      ``(token, (start, end))`` pairs, each checked against ``entireText``.

    Args:
        argMap: mapping of field name to raw value.
        entireText: the text all standoffs refer to.

    Returns:
        dict mapping output argument names to converted values.

    Raises:
        AssertionError: if a key is not in ``ExtendedSdc.fieldNames``.
        ValueError: if a string does not occur exactly once, a word-list
            token does not match the text at its indices, or a value has an
            unsupported type.
    """
    outputDict = {}
    for argName, argValue in argMap.items():
        assert argName in ExtendedSdc.fieldNames, (
            "Arg " + repr(argName) + " not in names." + " Value: " +
            repr(argValue))
        if argName == "id":
            outputDict["esdc_id"] = argValue
        elif isinstance(argValue, str):
            outputDict[argName] = _standoffsForUniqueString(argValue,
                                                            entireText)
        elif isEsdc(argValue):
            outputDict[argName] = list(fromYaml(entireText, argValue))
        elif isWordList(argValue):
            outputDict[argName] = _standoffsForWordList(argValue, entireText)
        else:
            raise ValueError("Must be strings or ESDCs: " + repr(argValue))
    return outputDict
def testDegreeOfOverlap(self):
    """Check degreeOfOverlap for a table of range pairs."""
    text = "Testing 123"
    # (receiver range, argument range, expected overlap)
    cases = [
        ((0, 1), (0, 1), 1),
        ((0, 1), (1, 2), 0),
        ((1, 2), (0, 1), 0),
        ((0, 10), (1, 2), 1),
        ((0, 10), (9, 10), 1),
        ((0, 10), (10, 11), 0),
        ((0, 10), (8, 11), 2),
        ((8, 11), (0, 10), 2),
        ((0, 5), (-1, 10), 5),
        ((0, 5), (6, 125), 0),
    ]
    for receiver_range, argument_range, expected in cases:
        receiver = TextStandoff(text, receiver_range)
        self.assertEqual(
            receiver.degreeOfOverlap(TextStandoff(text, argument_range)),
            expected)
def testContains(self):
    """Check contains for a table of range pairs."""
    text = "Testing 123"
    # (receiver range, argument range, expected result)
    cases = [
        ((0, 1), (0, 1), True),
        ((0, 1), (1, 2), False),
        ((1, 2), (0, 1), False),
        ((0, 10), (1, 2), True),
        ((0, 10), (9, 10), True),
        ((0, 10), (10, 11), False),
    ]
    for receiver_range, argument_range, expected in cases:
        receiver = TextStandoff(text, receiver_range)
        self.assertEqual(
            receiver.contains(TextStandoff(text, argument_range)),
            expected)
def testOverlaps(self):
    """Check overlaps for a table of range pairs."""
    text = "Testing 123"
    # (receiver range, argument range, expected result)
    cases = [
        ((0, 1), (0, 1), True),
        ((0, 1), (1, 2), False),
        ((0, 10), (1, 2), True),
        ((0, 10), (9, 10), True),
        ((0, 10), (10, 11), False),
        ((10, 11), (0, 10), False),
    ]
    for receiver_range, argument_range, expected in cases:
        receiver = TextStandoff(text, receiver_range)
        self.assertEqual(
            receiver.overlaps(TextStandoff(text, argument_range)),
            expected)
def testBefore(self):
    """Check before for a table of range pairs."""
    text = "Testing 123"
    # (receiver range, argument range, expected result)
    cases = [
        ((0, 1), (0, 1), False),
        ((0, 1), (1, 2), True),
        ((1, 2), (0, 1), False),
        ((0, 10), (1, 2), False),
        ((0, 10), (9, 10), False),
        ((0, 10), (10, 11), True),
    ]
    for receiver_range, argument_range, expected in cases:
        receiver = TextStandoff(text, receiver_range)
        self.assertEqual(
            receiver.before(TextStandoff(text, argument_range)),
            expected)
def extractEsdcList(self, sentence, dependencies):
    """
    Returns a list of ESDCs, without hierarchy.

    Each (relation, governor, dependent) triple in
    ``dependencies.dependencies`` is mapped to an ESDC according to its
    relation label; triples matching no rule are attached afterwards to any
    ESDC field that already contains their governor.  The result is
    de-duplicated, sorted and passed through ``mergeAll`` twice.

    If no ESDC was produced, a single EVENT ESDC whose relation spans the
    whole sentence is returned instead.
    """
    esdcs = []
    # Only ever extended below; its single reader is commented out.
    child_esdcs = []
    # Triples that matched none of the relation rules.
    leftover_deps = []
    for relation, gov, dep in dependencies.dependencies:
        #print "Relation:", relation
        if relation == "root":
            continue
        if relation == "prep":
            # NOTE(review): this ESDC is built but never appended to esdcs --
            # confirm whether bare "prep" relations are meant to be dropped.
            esdc = ExtendedSdc("EVENT", r=gov, l=ExtendedSdc("EVENT", r=dep))
        elif relation.startswith("prep"):
            prepStandoff, prep = standoffForString(relation, sentence)
            govTag = dependencies.tagForTokenStandoff(gov)
            # Prepositions in this list give a PLACE; all others give a PATH.
            if prep in [
                    "on", "in", "at", "near", "next to", "in front of",
                    "behind", "away from", "close to", "closer to"
            ]:
                esdcType = "PLACE"
            else:
                esdcType = "PATH"
            #if relation.startswith("prepc"):
            if not govTag in ["NN", "NNS", "NNP"]:
                # Governor is not tagged NN/NNS/NNP: EVENT with the
                # prepositional phrase as its landmark.
                esdc = ExtendedSdc("EVENT",
                                   r=gov,
                                   l=ExtendedSdc(esdcType,
                                                 r=prepStandoff,
                                                 l=dep))
                child_esdcs.extend(esdc.l)
                esdcs.append(esdc)
            else:
                if govTag in ["NN", "NNS", "NNP"]:
                    # Governor is tagged NN/NNS/NNP: OBJECT with the
                    # governor as figure.
                    esdc = ExtendedSdc("OBJECT",
                                       f=gov,
                                       r=prepStandoff,
                                       l=dep)
                else:
                    # NOTE(review): looks unreachable -- the enclosing else
                    # already implies govTag is in the list. Confirm.
                    esdc = ExtendedSdc(
                        "EVENT",
                        r=gov,
                        l=[ExtendedSdc(esdcType, r=prepStandoff, l=dep)])
                    child_esdcs.extend(esdc.l)
                esdcs.append(esdc)
        elif relation == "conj_and":
            # Each conjunct becomes its own EVENT.
            esdcs.append(ExtendedSdc("EVENT", r=[gov]))
            esdcs.append(ExtendedSdc("EVENT", r=[dep]))
        elif self.sdh.isa(relation, "arg"):
            esdc = ExtendedSdc("EVENT", r=gov, l=dep)
            esdcs.append(esdc)
        elif self.sdh.isa(relation, "subj"):
            esdc = ExtendedSdc("EVENT", f=dep, r=gov)
            esdcs.append(esdc)
        elif relation == "conj_and":
            # NOTE(review): never reached; "conj_and" is handled above.
            pass
        elif relation == "dep":
            esdc = ExtendedSdc("EVENT", r=[dep, gov])
            esdcs.append(esdc)
        else:
            leftover_deps.append((relation, gov, dep))
    # Attach each unmatched dependent to every field that holds its governor.
    for relation, gov, dep in leftover_deps:
        for esdc in flattenedEsdcs(esdcs):  #chain(esdcs, child_esdcs):
            for key, valueList in esdc:
                if gov in valueList:
                    valueList.append(dep)
    # De-duplicate and sort; the ESDCs are frozen around the set() call
    # (presumably so they can be hashed -- confirm against freeze()).
    freeze(esdcs)
    esdcs = list(sorted(set(esdcs)))
    unfreeze(esdcs)
    # mergeAll is applied twice in a row.
    esdcs = mergeAll(esdcs)
    esdcs = mergeAll(esdcs)
    #esdcs = mergeAll(esdcs)
    #for esdc in flattenedEsdcs(esdcs):
    #    esdc.updateRep()
    for esdc in flattenedEsdcs(esdcs):
        esdc.updateRep()
    if len(esdcs) == 0:
        # Fallback: treat the whole sentence as the relation of one EVENT.
        print "sentence", sentence, sentence.__class__
        sentence_standoff = TextStandoff(sentence, (0, len(sentence)))
        return [ExtendedSdc("EVENT", r=sentence_standoff)]
    return esdcs
def testEqual1(self):
    """Two ESDCs built independently from identical spans compare equal."""
    text = "Load the forklift onto the trailer."

    def span(start, end):
        # Standoff over the full sentence.
        return TextStandoff(text, (start, end))

    def build():
        # Returns a brand-new ESDC tree on every call.
        return ExtendedSdc(
            'EVENT',
            r=[span(0, 4)],
            l2=[
                ExtendedSdc(
                    'PATH',
                    r=[span(18, 22)],
                    l2=[],
                    l=[
                        ExtendedSdc('OBJECT', r=[], l2=[], l=[],
                                    f=[span(23, 26), span(27, 34)])
                    ],
                    f=[])
            ],
            l=[
                ExtendedSdc('OBJECT', r=[], l2=[], l=[],
                            f=[span(5, 8), span(9, 17)])
            ],
            f=[
                ExtendedSdc('OBJECT', r=[], l2=[], l=[], f=[],
                            entireText=text)
            ])

    e1 = build()
    e2 = build()

    self.assertEqual(e1, e2)
    self.assertFalse(e1 != e2)
    self.assertTrue(e1 in [e2])
    self.assertTrue(e2 in [e1])
def testSetter(self):
    """Assigning ``l`` updates ``fields['l']`` and makes esdc1 differ from esdc2."""
    replacement = [TextStandoff(self.sentence, (0, 4))]

    self.esdc1.l = replacement

    self.assertEqual(self.esdc1.fields['l'], replacement)
    self.assertNotEqual(self.esdc1, self.esdc2)
def makeNullTextStandoff(text):
    """Return a zero-width TextStandoff at the start of ``text``."""
    empty_range = (0, 0)
    return TextStandoff(text, empty_range)
def make_ggg_for_instruction(self, text):
    """Build an EVENT ESDC spanning all of ``text`` and the GGG derived from it.

    Returns:
        ``(esdc, ggg)`` where ``ggg`` is ``ggg_from_esdc(esdc)``.
    """
    whole_text = TextStandoff(text, (0, len(text)))
    esdc = ExtendedSdc("EVENT", text, r=whole_text)
    return esdc, ggg_from_esdc(esdc)