def process_batch(
    self,
    dp: DataPanel,
    columns: List[str],
    **kwargs,
) -> tuple:
    """Compute the similarity between the sentences of two columns.

    Args:
        dp (DataPanel): batch of examples to process
        columns (list): names of the two columns to compare
        **kwargs: optional keyword arguments (not used by this method)

    Returns:
        The value produced by ``self.similarity`` when called with the
        sentences retrieved for each of the given columns.
    """
    assert len(columns) == 2, "Exactly two columns required."

    # Look up the SpacyOp output for every column; the returned values are
    # intentionally discarded (presumably the call only validates that the
    # output is available -- confirm against `lookup`).
    for name in columns:
        lookup(dp, SpacyOp, [name])

    # Retrieve the sentences, one single-column group per requested column
    column_groups = [[name] for name in columns]
    retrieved = SpacyOp.retrieve(
        batch=dp,
        columns=column_groups,
        proc_fns=SpacyOp.sentences,
    )

    # Hand the per-column sentences to the similarity function, in order
    per_column_sentences = [retrieved[name] for name in columns]
    return self.similarity(*per_column_sentences)
def score(
    self,
    batch: DataPanel,
    columns: List[str],
    *args,
    **kwargs,
) -> np.ndarray:
    """Fuzzy-match the constituency parses of two columns, example by example.

    Args:
        batch (DataPanel): batch of examples
        columns (list): exactly two column names whose parses are compared
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Returns:
        np.ndarray holding one ``fuzz.partial_token_set_ratio`` value per pair
        of parses.
    """
    # Require that the number of keys is exactly 2
    assert len(columns) == 2, "Must specify exactly 2 keys."

    def _strip_structure(tree):
        # Drop brackets and spaces so only the labels/tokens are compared
        for char in ("(", ")", " "):
            tree = tree.replace(char, "")
        return tree

    # Retrieve the parse trees of each column
    parses = {}
    for name in columns:
        parses[name] = lookup(batch, AllenConstituencyParsingOp, [name])
    first_parses = parses[columns[0]]
    second_parses = parses[columns[1]]

    # Score every aligned pair of (stripped) trees
    ratios = []
    for first, second in zip(first_parses, second_parses):
        ratios.append(
            fuzz.partial_token_set_ratio(
                _strip_structure(first),
                _strip_structure(second),
            )
        )
    return np.array(ratios)
def test_apply(self):
    """Apply an ActivationOp to the dataset and check the stored activations."""
    activation_op = ActivationOp(model=self.model, target_module="hidden")
    processed = activation_op(self.dataset, columns=["i"])

    # The looked-up activations must come back as a plain list
    activations = lookup(processed, activation_op, ["i"])
    self.assertEqual(type(activations), list)

    # Once stacked, every activation is expected to be zero, with the
    # expected overall shape
    stacked = torch.stack(activations)
    all_zero = torch.all(torch.eq(stacked, 0))
    self.assertTrue(all_zero)
    self.assertEqual(list(stacked.shape), [4, 2, 10, 10])
def score(
    self,
    batch: DataPanel,
    columns: List[str],
    *args,
    **kwargs,
) -> np.ndarray:
    """Score how closely the second column's parse matches any subtree of the
    first column's parse.

    Args:
        batch (DataPanel): batch of examples
        columns (list): exactly two column names; subtrees are taken from the
            parses of the first, and the parses of the second are matched
            against them
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Returns:
        np.ndarray with, per example, the maximum ``fuzz.partial_ratio``
        between the normalized second-column parse and the normalized
        subtrees of the first-column parse.
    """
    # Require that the number of keys is exactly 2
    assert len(columns) == 2, "Must specify exactly 2 keys."

    def _normalize_subtree(subtree):
        # String form of the NLTK subtree without newlines/spaces, lower-cased
        return str(subtree).replace("\n", "").replace(" ", "").lower()

    def _normalize_candidate(tree):
        # Remove spaces and the "(..)" / "(,,)" nodes, then lower-case
        return (
            tree.replace(" ", "").replace("(..)", "").replace("(,,)", "").lower()
        )

    # Retrieve the parse trees of each column
    parses = {}
    for name in columns:
        parses[name] = lookup(batch, AllenConstituencyParsingOp, [name])
    reference_parses = parses[columns[0]]
    candidate_parses = parses[columns[1]]

    # Convert the trees corresponding to key 0 to NLTK trees
    reference_trees = [nltk.Tree.fromstring(parse) for parse in reference_parses]

    # Collect the distinct normalized subtrees of every reference tree
    subtree_sets = [
        {_normalize_subtree(subtree) for subtree in tree.subtrees()}
        for tree in reference_trees
    ]

    # Best fuzzy score of each candidate against any subtree of its reference
    best_scores = []
    for candidate, subtrees in zip(candidate_parses, subtree_sets):
        flat_candidate = _normalize_candidate(candidate)
        best_scores.append(
            max(fuzz.partial_ratio(flat_candidate, subtree) for subtree in subtrees)
        )
    return np.array(best_scores)
def prepare_batch(
    self,
    batch: DataPanel,
    columns: List[str],
    *args,
    **kwargs,
) -> None:
    """Compute the scores for a batch and append them to ``self.scores``.

    Args:
        batch (DataPanel): batch of examples
        columns (list): column names passed on to the scorer
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Raises:
        RuntimeError: if ``self.score`` is neither an ``Operation`` nor a
            callable.
    """
    scorer = self.score

    if isinstance(scorer, Operation):
        # An Operation's output is fetched from the batch via `lookup`
        batch_scores = lookup(batch, scorer, columns)
    elif isinstance(scorer, Callable):
        # Any other callable is invoked directly on the batch
        batch_scores = scorer(batch, columns)
    else:
        raise RuntimeError("score function invalid.")

    self.scores.extend(batch_scores)
def score(
    self,
    batch: DataPanel,
    columns: List[str],
    *args,
    **kwargs,
) -> np.ndarray:
    """Compute per-example lengths for each column and reduce across columns.

    Args:
        batch (DataPanel): batch of examples
        columns (list): column names to measure
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Returns:
        The result of ``self.reduction_fn`` applied along axis 0 (the column
        axis) of the array of lengths.
    """
    try:
        # Preferred path: length of each document looked up from SpacyOp
        lengths = []
        for name in columns:
            docs = lookup(batch, SpacyOp, [name])
            lengths.append([len(doc) for doc in docs])
    except AttributeError:
        # Fallback: count whitespace-separated tokens of the raw column values
        lengths = []
        for name in columns:
            lengths.append([len(text.split()) for text in batch[name]])

    # Reduction over column key axis
    length_matrix = np.array(lengths)
    return self.reduction_fn(length_matrix, axis=0)
def test_apply(self):
    """Run the Bootleg annotator on the testbed and check the output keys."""
    # Create the Bootleg cached operation and apply it to the text column
    annotator = BootlegAnnotatorOp(cache_dir=self.cache_dir)
    annotated = annotator(self.testbed.dataset, columns=["text"])

    # Keys that every looked-up output is required to contain
    expected_keys = (
        "qids",
        "probs",
        "titles",
        "cands",
        "cand_probs",
        "spans",
        "aliases",
    )

    # Make sure things match up
    outputs = lookup(annotated, annotator, ["text"])
    for annotation in outputs:
        for key in expected_keys:
            assert key in annotation
def test_apply(self):
    """Run StanzaOp on the testbed and compare the lemmas of every document."""
    # Create the Stanza cached operation and apply it to the text column
    stanza_op = StanzaOp()
    processed = stanza_op(self.testbed.dataset, columns=["text"])

    # Lemmas extracted from each looked-up document
    docs = lookup(
        processed,
        stanza_op,
        ["text"],
    )
    actual_lemmas = [doc.get("lemma") for doc in docs]

    expected_lemmas = [
        ["the", "man", "be", "walk", "."],
        ["the", "man", "be", "run", "."],
        ["the", "woman", "be", "sprint", "."],
        ["the", "woman", "be", "rest", "."],
        ["the", "hobbit", "be", "fly", "."],
        ["the", "hobbit", "be", "swim", "."],
    ]

    # Make sure things match up
    self.assertEqual(actual_lemmas, expected_lemmas)
def apply(
    self,
    batch: DataPanel,
    columns: List[str],
    slice_membership: np.ndarray = None,
    *args,
    **kwargs,
) -> np.ndarray:
    """Score every example of the batch and bin the scores.

    Args:
        batch (DataPanel): batch of examples
        columns (list): column names passed on to the scorer
        slice_membership (np.ndarray, optional): when given, its first
            dimension must equal the number of scores. Defaults to None, in
            which case the length check is skipped.
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Returns:
        The value returned by ``self.bin`` for the computed scores.

    Raises:
        RuntimeError: if ``self.score`` is neither an ``Operation`` nor a
            callable.
        AssertionError: if ``slice_membership`` is given and the number of
            scores differs from ``slice_membership.shape[0]``.
    """
    # Keep track of the score of each example
    if isinstance(self.score, Operation):
        scores = lookup(batch, self.score, columns)
    elif isinstance(self.score, Callable):
        scores = self.score(batch, columns)
    else:
        raise RuntimeError("score function invalid.")

    # `slice_membership` defaults to None; only validate against it when it
    # was actually provided, instead of failing on `None.shape`.
    if slice_membership is not None:
        assert (
            len(scores) == slice_membership.shape[0]
        ), "Must have exactly one score per example."

    return self.bin(scores=scores)
def score(
    self,
    batch: DataPanel,
    columns: List[str],
    *args,
    **kwargs,
) -> np.ndarray:
    """Compute the token-level intersection-over-union between two columns.

    Args:
        batch (DataPanel): batch of examples
        columns (list): exactly two column names to compare
        *args: unused positional arguments
        **kwargs: unused keyword arguments

    Returns:
        np.ndarray with one score per example: the number of shared
        lower-cased tokens divided by the number of distinct lower-cased
        tokens across both columns. A pair in which both token sets are
        empty is scored 0.0 rather than dividing by zero.
    """
    # Require that the number of keys is exactly 2
    assert len(columns) == 2, "Must specify exactly 2 keys."

    # Lookup the tokens after lower-casing and placing into a set
    try:
        tokens = {
            col: [
                {str(tok).lower() for tok in doc}
                for doc in lookup(batch, SpacyOp, [col])
            ]
            for col in columns
        }
    except AttributeError:
        # Fallback: whitespace-tokenize the raw column values
        tokens = {
            col: [
                {str(tok).lower() for tok in text.split()} for text in batch[col]
            ]
            for col in columns
        }

    # Compute the intersection over union score
    scores = []
    for tokens_0, tokens_1 in zip(tokens[columns[0]], tokens[columns[1]]):
        union_size = len(tokens_0.union(tokens_1))
        if union_size == 0:
            # Both examples have no tokens: there is nothing to overlap, and
            # dividing would raise ZeroDivisionError.
            scores.append(0.0)
        else:
            scores.append(len(tokens_0.intersection(tokens_1)) / float(union_size))
    return np.array(scores)