def forward(self, graph, alreadySeenIn, activeNodesIn):
    """
    Expand the graph frontier by one hop.

    Inputs:
        graph: A tuple (nbrs, nbrCounts) representing the adjacency graph
            of nodes.
        alreadySeenIn: Tensor of already-visited node indices.
        activeNodesIn: Tensor of frontier nodes (neighbors of the seen set
            which don't fall back into the seen set).
    Outputs:
        alreadySeenOut: alreadySeenIn union the newly discovered nodes.
        activeNodesOut: nbrs(activeNodesIn) - alreadySeenOut.
    """
    nbrs, nbrCounts = graph
    seen = set(alreadySeenIn.tolist())
    frontier = set()
    for node in activeNodesIn.tolist():
        # Only the first nbrCounts[node] entries of the row are valid.
        degree = int(nbrCounts[node])
        for nbr in nbrs[node, 0:degree].tolist():
            if nbr not in seen:
                frontier.add(int(nbr))
    seen |= frontier
    activeNodesOut = longTensor(sorted(frontier), device=activeNodesIn.device)
    alreadySeenOut = longTensor(sorted(seen), device=alreadySeenIn.device)
    return alreadySeenOut, activeNodesOut
def SpotNeighborsExplorerTest(testNo, appConfig, modelArgs, device):
    """
    Randomized equivalence test: the default SpotNeighborsExplorer
    implementation must match the reference python implementation on a
    random graph, hop by hop, until the active frontier is exhausted.
    """
    from hier2hier.models.spotNeighborsExplorer import SpotNeighborsExplorer
    nodeCount = random.randint(1, 100)
    # Bug fix: random.randint requires integer bounds; nodeCount / 2 is a
    # float (raises on Python 3) and is < 1 when nodeCount == 1.
    maxNbrs = random.randint(1, max(1, nodeCount // 2))
    nbrs = torch.randint(nodeCount, (nodeCount, maxNbrs), device=device)
    nbrCounts = torch.randint(maxNbrs, (nodeCount, ), device=device)
    # Pad the unused neighbor slots of each node with -1.
    for node in range(nodeCount):
        for nbr in range(int(nbrCounts[node]), maxNbrs):
            nbrs[node, nbr] = -1
    graph = (nbrs, nbrCounts)
    explorerUnderTest = SpotNeighborsExplorer(device=device)
    explorerToMatch = SpotNeighborsExplorer(impl_selection="python", device=device)
    # Start from a random subset of up to a third of the nodes.
    startActiveSetCount = random.randint(0, int(nodeCount / 3))
    activeSetIn = longTensor(
        sorted(random.sample(range(nodeCount), startActiveSetCount)),
        device=device)
    alreadySeenSetIn = activeSetIn.clone()
    while activeSetIn.shape[0]:
        alreadySeenOut1, activeSetOut1 = explorerToMatch(
            graph, alreadySeenSetIn, activeSetIn)
        alreadySeenOut2, activeSetOut2 = explorerUnderTest(
            graph, alreadySeenSetIn, activeSetIn)
        # Both implementations must agree on the seen and active sets,
        # both as sets and in cardinality (no duplicates).
        assert (set(alreadySeenOut1.tolist()) == set(alreadySeenOut2.tolist()))
        assert (len(alreadySeenOut1.tolist()) == len(alreadySeenOut2.tolist()))
        assert (set(activeSetOut1.tolist()) == set(activeSetOut2.tolist()))
        assert (len(activeSetOut1.tolist()) == len(activeSetOut2.tolist()))
        alreadySeenSetIn = alreadySeenOut1
        activeSetIn = activeSetOut1
def avdl2Ndfo(self):
    """Map each AVDL index to the NDFO index of the node owning that attribute."""
    attrCount = len(self.__adfo2Toi)
    ndfoByAvdl = [None] * attrCount
    for node, avdlList in self.node2AvdlList.items():
        for avdl in avdlList:
            ndfoByAvdl[avdl] = self.node2Ndfo[node]
    return longTensor(ndfoByAvdl, device=self.device)
def encodedAttrSymbolsByAvdlp(self):
    """
    Pack the vocab-encoded attribute value strings, one sequence per AVDL
    index with a non-empty value; empty overall result yields an empty tensor.
    """
    attrValuesVocab = self.torchBatch.dataset.fields[
        "src"].vocabs.attrValues
    encoded = []
    for adfo in self.avdl2Adfo:
        value = self.attrsByAdfo[adfo][1]
        if value:
            encoded.append(longTensor(
                [attrValuesVocab.stoi[ch] for ch in value],
                device=self.device,
            ))
    if encoded:
        return rnn.pack_sequence(encoded)
    return longTensor(
        [],
        device=self.device,
    )
def posNbrhoodGraphByGndtol(self):
    """
    Neighborhood adjacency re-indexed from GNI to GNDTOL space.

    Returns a pair (padded adjacency tensor, adjacency lengths tensor).
    """
    adjLists = []
    for gndtol in range(self.graphNodeCount):
        gni = self.gndtol2Gni[gndtol]
        nbrGndtols = sorted(
            self._gni2Gndtol[nbrGni]
            for nbrGni in self.posNbrhoodGraphByGni[gni])
        adjLists.append(longTensor(nbrGndtols, device=self.device))
    lengths = longTensor(
        [len(adjList) for adjList in adjLists], device=self.device)
    padded = rnn.pad_sequence(adjLists, batch_first=True)
    return (padded, lengths)
def encodedAttrLabelsByAvdl(self):
    """Vocab-encoded attribute label (name) for each AVDL index."""
    attrsVocab = self.torchBatch.dataset.fields["src"].vocabs.attrs
    labels = [
        attrsVocab.stoi[self.attrsByAdfo[adfo][0]]
        for adfo in self.avdl2Adfo
    ]
    return longTensor(labels, device=self.device)
def parentSelectorByNdfo(self):
    """For each node in NDFO order, the NDFO index of that node's parent."""
    parentNdfoIndices = []
    for node in self.ndfo2Node:
        parent = self.node2Parent[node]
        parentNdfoIndices.append(self.node2Ndfo[parent])
    return longTensor(parentNdfoIndices, device=self.device)
def avdlAttrSelectorsListByNdac(self):
    """
    For each attribute position, a tensor of the AVDL indices selecting
    that position across nodes; ordered so lengths increase down the list.
    """
    maxAttrCount = len(self.ndfo2Node[self.ndac2Ndfo[0]].attrib)
    selectors = [[] for _ in range(maxAttrCount)]
    for avdlIndices in self.ndac2AvdlList:
        for attrNumber, avdlIndex in enumerate(avdlIndices):
            selectors[attrNumber].append(avdlIndex)
    # Reversed, because we want the items to come in increasing order of length.
    return [
        longTensor(selector, device=self.device)
        for selector in reversed(selectors)
    ]
def childSelectorByNdfoList(self):
    """
    Build NDFO child node selector lists: for each child position, a tensor
    of the NDFO indices of children at that position.

    The accumulation below only works because all nodes appear in
    decreasing fanout order.
    """
    maxNodeFanout = len(self.ndfo2Node[0])
    selectors = [[] for _ in range(maxNodeFanout)]
    for node in self.ndfo2Node:
        for childNumber, childNode in enumerate(node):
            selectors[childNumber].append(self.node2Ndfo[childNode])
    # Reversed, because we want the items to come in increasing order of length.
    return [
        longTensor(selector, device=self.device)
        for selector in reversed(selectors)
    ]
def __iter__(self, mode=AppMode.Generate):
    """
    Iterate over batches, preprocessing each one on the first pass and
    caching a trimmed snapshot in self.savedBatches; later iterations
    replay the cached snapshots.

    NOTE(review): since `for` calls __iter__ with no arguments, the
    mode-specific branches below only trigger when __iter__ is invoked
    explicitly with a mode — confirm callers.
    """
    if self.savedBatches is None:
        # First pass: preprocess and snapshot every batch.
        self.savedBatches = []
        for batch in super().__iter__():
            savedBatchData = AttrTuple()
            processedBatch = self.preprocess_batch(batch)
            savedBatchData.sampleCount = len(processedBatch.torchBatch.src)
            # Node encodings and tree-structure selectors.
            savedBatchData.encodedNodesByNdfo = processedBatch.encodedNodesByNdfo
            savedBatchData.parentSelectorByNdfo = processedBatch.parentSelectorByNdfo
            savedBatchData.childSelectorByNdfoList = processedBatch.childSelectorByNdfoList
            savedBatchData.decreasingFanoutsFactorByNdfo = processedBatch.decreasingFanoutsFactorByNdfo
            # Attribute encodings and attribute index maps (materialized
            # onto self.device where they are plain lists).
            savedBatchData.encodedAttrLabelsByAvdl = processedBatch.encodedAttrLabelsByAvdl
            savedBatchData.encodedAttrSymbolsByAvdlp = processedBatch.encodedAttrSymbolsByAvdlp
            savedBatchData.avdl2Ndac = longTensor(processedBatch.avdl2Ndac, device=self.device)
            savedBatchData.ndac2Ndfo = longTensor(processedBatch.ndac2Ndfo, device=self.device)
            savedBatchData.avdl2Ndfo = processedBatch.avdl2Ndfo
            savedBatchData.avdlAttrSelectorsListByNdac = processedBatch.avdlAttrSelectorsListByNdac
            savedBatchData.decreasingAttrCountsFactorByNdac = processedBatch.decreasingAttrCountsFactorByNdac
            # Text/tail encodings and their index maps.
            savedBatchData.encodedTextByTtDLP = processedBatch.encodedTextByTtDLP
            savedBatchData.encodedTailByTlDLP = processedBatch.encodedTailByTlDLP
            savedBatchData.ndttl2Ndac = longTensor(
                processedBatch.ndttl2Ndac, device=self.device)
            savedBatchData.ndtll2Ndttl = processedBatch.ndtll2Ndttl
            savedBatchData.ndfo2Ndtll = longTensor(
                processedBatch.ndfo2Ndtll, device=self.device)
            savedBatchData.ndfo2Ndac = longTensor(processedBatch.ndfo2Ndac, device=self.device)
            # Target outputs and TDOL index maps.
            savedBatchData.targetOutputsByTdol = processedBatch.targetOutputsByTdol
            savedBatchData.targetOutputLengthsByTdol = processedBatch.targetOutputLengthsByTdol
            savedBatchData.targetOutputsByTdolList = processedBatch.targetOutputsByTdolList
            savedBatchData.tdol2Toi = processedBatch.tdol2Toi
            savedBatchData.toi2Tdol = processedBatch.toi2Tdol
            savedBatchData.gndtol2Tdol = processedBatch.gndtol2Tdol
            # Graph-node index maps and spotlight data.
            savedBatchData.goi2Gndtol = processedBatch.goi2Gndtol
            savedBatchData.gndtol2Gni = longTensor(
                processedBatch.gndtol2Gni, device=self.device)
            savedBatchData.posNbrhoodGraphByGndtol = processedBatch.posNbrhoodGraphByGndtol
            savedBatchData.fullSpotlight = processedBatch.fullSpotlight
            savedBatchData.targetOutputsByToi = processedBatch.targetOutputsByToi
            savedBatchData.targetOutputLengthsByToi = processedBatch.targetOutputLengthsByToi
            savedBatchData.srcSymbolsByGndtol = processedBatch.srcSymbolsByGndtol
            # Test attrs
            if mode == AppMode.Test:
                # Test mode keeps the full processed batch, not the snapshot.
                savedBatchData = processedBatch
            elif mode == AppMode.Evaluate:
                savedBatchData.inputs = processedBatch.inputs
                savedBatchData.outputs = processedBatch.outputs
            self.savedBatches.append(savedBatchData)
    for processedBatch in self.savedBatches:
        yield processedBatch
def encodedTextByNdtlp2(self):
    """
    Two-slot list of packed, vocab-encoded character sequences:
    slot 0 for node text, slot 1 for node tail. A slot is None when
    there are no nodes or no non-empty strings for it.
    """
    packed = [None, None]
    textVocab = self.torchBatch.dataset.fields["src"].vocabs.text
    for i, isTail in enumerate([False, True]):
        nodes = self.ndtl2Node2[isTail]
        if not nodes:
            continue
        texts = [node.tail if isTail else node.text for node in nodes]
        encoded = [
            longTensor([textVocab.stoi[ch] for ch in text],
                       device=self.device)
            for text in texts
            if text not in [None, ""]
        ]
        packed[i] = rnn.pack_sequence(encoded) if encoded else None
    return packed
def goi2Gni(self): retval = [] # ndfo2Toi, avdl2Toi, avdlp2Toi, ndtlp2Toi2[0], ndtlp2Toi2[1]. # Process nodes. for xmlTree in self.inputs: for node in xmlTree.iter(): ndfo = self.node2Ndfo[node] retval.append(self.ndfo2Gni[ndfo]) # Process attributes. for xmlTree in self.inputs: for node in xmlTree.iter(): ndfo = self.node2Ndfo[node] ndac = self.ndfo2Ndac[ndfo] for attrIndex in range(len(node.attrib)): avdl = self.attrTuple2Avdl[(ndac, attrIndex)] retval.append(self.avdl2Gni[avdl]) # Process text and tail. for isTail in [False, True]: ndtxp2Gni = self.ndtlp2Gni if isTail else self.ndttp2Gni for xmlTree in self.inputs: for node in xmlTree.iter(): tailOrText = node.tail if isTail else node.text if not tailOrText: continue ndfo = self.node2Ndfo[node] ndtx2 = self.ndfo2Ndtl2[isTail][ndfo] for symIndex in range(len(tailOrText)): ndtxp2 = self.ndtxTuple2Ndtlp2[isTail][(ndtx2, symIndex)] retval.append(ndtxp2Gni[ndtxp2]) # Safety assertions. assert (len(retval) == self.graphNodeCount) assert (len(set(retval)) == self.graphNodeCount) return longTensor(retval, device=self.device)
def tdol2Toi(self):
    """TOI indices sorted by decreasing target output length, ties by TOI."""
    lengths = self.targetOutputLengthsByToi
    order = sorted(
        range(len(lengths)),
        key=lambda toi: (-lengths[toi], toi))
    return longTensor(order, device=self.device)
def ndtll2Ndttl(self):
    """Map each NDTLL index to its NDTTL index via the shared NDFO index."""
    mapped = [self.ndfo2Ndttl[ndfo] for ndfo in self.ndtll2Ndfo]
    return longTensor(mapped, device=self.device)
def cullSmallFactors(
        gndtol2Tdol,
        beamMode,
        discoveredGndtol,
        attentionFactors,
        maxAttentionFactorByTDOL,
        spotlightThreshold,
):
    """
    Cull spotlight indices whose attention factor is a small fraction
    (below spotlightThreshold) of the max attention factor of their tree.

    Inputs:
        gndtol2Tdol: Map from GNDTOL index to tree (TDOL) index.
        beamMode: When True, attention tensors carry a beam dimension.
        discoveredGndtol:
            Shape: sliCount
        attentionFactors:
            if beamMode:
                Shape: beamCount X sliCount
            else:
                Shape: sliCount
        maxAttentionFactorByTDOL:
            if beamMode:
                Shape: treeCount X beamCount
            else:
                Shape: treeCount
        spotlightThreshold: Fraction of the per-tree max below which a
            factor is culled (e.g. 1/1000).
    Outputs:
        True when every index survives, False when none does, otherwise a
        LongTensor of the surviving positions within discoveredGndtol.
    """
    # Get TDOL of each SLI.
    # Shape: sliCount
    # Value: tree index of the SLI.
    discoveredGndtol2Tdol = gndtol2Tdol[discoveredGndtol]

    # Max factor applicable to each SLI.
    # if beamMode: Shape: sliCount X beamCount
    # else: Shape: sliCount
    maxAttentionFactorToUse = maxAttentionFactorByTDOL[discoveredGndtol2Tdol]
    if beamMode:
        # Shape: beamCount X sliCount.
        # Permute last two dimensions to make it ready for comparison.
        maxAttentionFactorToUse = maxAttentionFactorToUse.permute(1, 0)

    # Scale down: anything at or below this fraction of the max is culled.
    # (Out-of-place multiply: no need to mutate the indexed copy.)
    maxAttentionFactorToUse = maxAttentionFactorToUse * spotlightThreshold

    # Compare.
    # Shape: beamCount X sliCount (beam mode) or sliCount.
    retainedIndicesBool = (attentionFactors > maxAttentionFactorToUse)
    if beamMode:
        # Collapse along beamCount dimension.
        # Retain if any beam is suggesting retention.
        retainedIndicesBool = (torch.sum(retainedIndicesBool, dim=0) != 0)

    retainedCount = torch.sum(retainedIndicesBool)
    if retainedCount == 0:
        return False
    elif retainedCount == len(retainedIndicesBool):
        return True

    # Vectorized replacement for the original per-element python loop:
    # nonzero() yields the retained positions as a LongTensor on the same
    # device as the boolean mask.
    return retainedIndicesBool.nonzero(as_tuple=False).flatten()
def fullSpotlight(self):
    """Spotlight tensor covering every graph node index."""
    allGraphNodes = [*range(self.graphNodeCount)]
    return longTensor(allGraphNodes, device=self.device)
def gni2Gndtol(self):
    """Mapping of GNI indices to GNDTOL indices, as a tensor on self.device."""
    mapping = self._gni2Gndtol
    return longTensor(mapping, device=self.device)
def outputs(self):
    """Target outputs as a (tgt tensor, tgt lengths tensor) pair on self.device."""
    tgt, tgtLengths = self.torchBatch.tgt
    # NOTE(review): torch.tensor copies its input; presumably tgt is
    # list-like here — confirm it is not already a tensor.
    tgtOnDevice = torch.tensor(tgt, device=self.device)
    lengthsOnDevice = longTensor(tgtLengths, device=self.device)
    return tgtOnDevice, lengthsOnDevice
def gni2Tdol(self):
    """TDOL (tree) index of each GNI-indexed graph node."""
    tdols = []
    for toi in self.gni2Toi:
        tdols.append(int(self.toi2Tdol[toi]))
    return longTensor(tdols, device=self.device)
def toi2Tdol(self):
    """Inverse permutation of tdol2Toi: maps each TOI index to its TDOL index."""
    inverse = invertPermutation(self.tdol2Toi.tolist())
    return longTensor(inverse, device=self.device)
def encodedNodesByNdfo(self):
    """Vocab-encoded tag of each node, in NDFO order."""
    tagVocab = self.torchBatch.dataset.fields["src"].vocabs.tags
    return longTensor(
        [tagVocab.stoi[node.tag] for node in self.ndfo2Node],
        device=self.device)