def __init__(
    self,
    pipeline: "nlpaug_flow.Pipeline",
    num_transformed: int = 1,
    identifiers: List[Identifier] = None,
    *args,
    **kwargs,
):
    assert isinstance(pipeline, nlpaug_flow.Pipeline), (
        "`pipeline` must be an nlpaug Pipeline object. Please use \n"
        "from nlpaug.flow import Sequential\n"
        "rg.NlpAugTransformation(pipeline=Sequential(flow=[...]))."
    )
    super(NlpAugTransformation, self).__init__(
        num_transformed=num_transformed,
        identifiers=Identifier.range(
            n=num_transformed,
            _name=self.__class__.__name__,
            pipeline=[
                Identifier(
                    _name=augmenter.name,
                    src=augmenter.aug_src if hasattr(augmenter, "aug_src") else None,
                    action=augmenter.action,
                    method=augmenter.method,
                )
                for augmenter in pipeline
            ],
        )
        if not identifiers
        else identifiers,
        *args,
        **kwargs,
    )

    # Set the pipeline
    self.pipeline = pipeline
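# A minimal usage sketch following the construction pattern in the assert
# message above. Assumes nlpaug is installed; the SynonymAug augmenter and
# num_transformed value are illustrative, not from the source.
from nlpaug.flow import Sequential
import nlpaug.augmenter.word as naw

nlpaug_transformation = NlpAugTransformation(
    pipeline=Sequential(flow=[naw.SynonymAug(aug_src="wordnet")]),
    num_transformed=2,
)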
def __setstate__(self, state):
    state = dict(state)

    # Deserialize entries that were stored as strings
    if "interactions" in state and isinstance(state["interactions"], str):
        state["interactions"] = self.loads_interactions(
            state["interactions"]
        ).interactions

    if "identifier" in state and isinstance(state["identifier"], str):
        state["identifier"] = Identifier.loads(state["identifier"])

    if "_identifier" in state:
        try:
            state["_identifier"] = Identifier.loads(state["_identifier"])
        except:  # noqa
            pass

    # Rebuild the lineage, reloading the Identifier stored in each tuple
    if "lineage" in state:
        try:
            state["lineage"] = [
                tuple(t[:1])
                + (Identifier.loads(t[1]),)
                + (tuple(t[2:]) if len(t) > 2 else ())
                for t in state["lineage"]
            ]
        except:  # noqa
            pass

    # Recompute the log directory, falling back to the private identifier
    if "logdir" in state:
        try:
            state["logdir"] = (
                pathlib.Path.home()
                / f"robustnessgym/datasets/{str(state['identifier'])}"
            )
        except:  # noqa
            state["logdir"] = (
                pathlib.Path.home()
                / f"robustnessgym/datasets/{str(state['_identifier'])}"
            )

    super(Dataset, self).__setstate__(state)
def from_dataset(
    cls,
    dp: DataPanel,
    input_columns: List[str],
    output_columns: List[str],
    # prediction_columns: List[str],
    # metrics: List[str],
) -> TestBench:
    """Create a TestBench from a DataPanel."""
    # Define the task
    task = Task(
        # Identifier
        Identifier("Task", dp=str(dp.identifier)),
        # Input and output schemas
        *Schema.for_dataset(dp, input_columns, output_columns),
    )

    # Create the testbench
    testbench = TestBench(
        identifier=Identifier("TestBench", dp=str(dp.identifier)),
        task=task,
        slices=[dp],
    )

    # testbench.set_single_dataset_mode()
    # testbench.set_prediction_columns(prediction_columns)

    return testbench
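# A minimal usage sketch, assuming `from_dataset` is exposed as a classmethod
# on TestBench; `dp` and the column names below are illustrative assumptions.
testbench = TestBench.from_dataset(
    dp=dp,
    input_columns=["premise", "hypothesis"],
    output_columns=["label"],
)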
def setUp(self):
    self.min_identifier = Identifier(_name="MyIdentifier")
    self.identifier = Identifier(
        _name="MyIdentifier",
        _index=1,
        param="a",
        param_2="b",
    )
def test_eq(self):
    # Two identifiers created with the same arguments should be equal
    identifier = Identifier(_name="MyIdentifier", _index=1, param="a", param_2="b")
    self.assertEqual(self.identifier, identifier)
    self.assertNotEqual(self.min_identifier, identifier)

    # But not two identifiers created with different arguments
    identifier = Identifier(_name="MyIdentifier", _index=2, param="a", param_2="b")
    self.assertNotEqual(self.identifier, identifier)
    self.assertNotEqual(self.min_identifier, identifier)
def setUp(self):
    # Arrange
    self.cachedop = CachedOperation(
        apply_fn=a_single_column_apply_fn,
        identifier=Identifier(_name="TestCachedOperation"),
    )
    self.testbed = MockTestBedv0()

    self.multicol_cachedop = CachedOperation(
        apply_fn=a_multi_column_apply_fn,
        identifier=Identifier(_name="TestCachedOperation", to="multiple"),
    )
def __init__(self, *args, **kwargs):
    super(ConstituencySubtreeSubpopulation, self).__init__(
        intervals=[(1, 1)],
        identifiers=[Identifier(_name=self.__class__.__name__)],
        *args,
        **kwargs,
    )
def __init__(self):
    super(HansLocationNounsB, self).__init__(
        phrase_groups=[
            ["museum", "school", "library", "office", "laboratory"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    super(HansUnderstoodArgumentVerbs, self).__init__(
        phrase_groups=[
            ["paid", "explored", "won", "wrote", "left", "read", "ate"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    super(HansPastParticiples, self).__init__(
        phrase_groups=[
            ["studied", "paid", "helped", "investigated", "presented"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def update(self, identifier: Union[str, Identifier], columns: List[str]) -> None:
    """Update the interaction tape with information about an interaction.

    Args:
        identifier: Identifier for the interaction used.
        columns: list of columns on which the interaction was applied.
    """
    if isinstance(identifier, str):
        identifier = Identifier(_name=identifier)
    elif isinstance(identifier, Identifier):
        pass
    else:
        raise ValueError(
            f"Parameter `identifier` should be an instance of class Identifier "
            f"or str, not {type(identifier)}."
        )

    # Dump the column names to JSON
    json_columns = strings_as_json(strings=columns)

    # If the entry is not already in the history, give it the next index
    if (identifier, json_columns) not in self.history:
        self.history[(identifier, json_columns)] = len(self.history)
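# A minimal sketch of the deduplication behavior above, assuming the method
# belongs to a class named InteractionTape with a `history` dict (names
# inferred from the method body; the identifier and columns are illustrative).
tape = InteractionTape()
tape.update(identifier="LexicalOverlap", columns=["premise", "hypothesis"])
tape.update(identifier="LexicalOverlap", columns=["premise", "hypothesis"])
assert len(tape.history) == 1  # the repeated interaction is not re-added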
def test_from_jsonl(self):
    # Create a temporary directory
    os.makedirs("tmp", exist_ok=True)

    # Create a .jsonl file with data
    with jsonlines.open("tmp/data.jsonl", "w") as writer:
        writer.write_all(
            transpose_batch(
                {
                    "a": [1, 2, 3],
                    "b": [True, False, True],
                    "c": ["x", "y", "z"],
                    "d": [{"e": 2}, {"e": 3}, {"e": 4}],
                }
            )
        )

    # Load the dataset
    dataset = Dataset.from_jsonl(
        json_path="tmp/data.jsonl",
        identifier=Identifier(_name="MockJSONDataset"),
    )

    self.assertEqual(set(dataset.column_names), {"a", "b", "c", "d", "index"})
    self.assertEqual(len(dataset), 3)

    # Remove the temporary directory
    shutil.rmtree("tmp")
def __init__(self):
    # Create a fake dataset
    self.dataset = Dataset.from_batch(
        {
            "text_a": [
                "Before the actor slept, the senator ran.",
                "The lawyer knew that the judges shouted.",
                "If the actor slept, the judge saw the artist.",
                "The lawyers resigned, or the artist slept.",
            ],
            "text_b": [
                "The actor slept.",
                "The judges shouted.",
                "The actor slept.",
                "The artist slept.",
            ],
            "label": [0, 0, 1, 1],
            "z": [1, 0, 1, 0],
            "fast": [False, True, True, False],
        },
        identifier=Identifier(_name="MockDataset", version="2.0"),
    )

    # Keep a copy of the original
    self.original_dataset = deepcopy(self.dataset)

    assert len(self.dataset) == 4
def __init__(self):
    super(HansConstAdv, self).__init__(
        phrase_groups=[
            ["after", "before", "because", "although", "though", "since", "while"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    super(HansFoodWords, self).__init__(
        phrase_groups=[
            ["fruit", "salad", "broccoli", "sandwich", "rice", "corn", "ice cream"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def test_init(self):
    # Create a simple identifier with a name
    identifier = Identifier(_name="MyIdentifier")
    self.assertEqual(str(identifier), "MyIdentifier")

    # Create an identifier with a string index
    identifier = Identifier(_name="MyIdentifier", _index="abc")
    self.assertEqual(str(identifier), "MyIdentifier-abc")

    # Create an identifier with an integer index
    identifier = Identifier(_name="MyIdentifier", _index=1)
    self.assertEqual(str(identifier), "MyIdentifier-1")

    # Create an identifier with an integer index and two parameters
    identifier = Identifier(_name="MyIdentifier", _index=1, param="a", param_2="b")
    self.assertEqual(str(identifier), "MyIdentifier-1(param=a, param_2=b)")
def from_jsonl(
    cls,
    json_path: str,
    identifier: Identifier = None,
    dataset_fmt: str = "in_memory",
) -> Dataset:
    """Load a dataset from a .jsonl file on disk, where each line of the
    file is a single JSON-encoded example."""
    if dataset_fmt == "in_memory":
        # Load the .jsonl file line by line
        with open(json_path) as f:
            data = [json.loads(line) for line in f]
        return cls(
            data,
            identifier=identifier
            if identifier
            else Identifier("RGDataset", jsonl=json_path),
            dataset_fmt=dataset_fmt,
        )
    elif dataset_fmt == "datasets":
        # Use jsonarrow (pyarrow's json reader) to load the file directly
        return cls(
            jsonarrow.read_json(json_path),
            identifier=identifier,
            dataset_fmt=dataset_fmt,
        )
    else:
        raise NotImplementedError
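# A minimal usage sketch mirroring the test above: load a .jsonl file into an
# in-memory Dataset (the path and identifier name are illustrative).
dataset = Dataset.from_jsonl(
    json_path="data/train.jsonl",
    identifier=Identifier(_name="MyJSONLDataset"),
)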
def __init__(
    self,
    dataset: str,
    model: str,
    constrain_pos: bool = True,
    **kwargs,
):
    super().__init__(
        identifiers=[
            Identifier(self.__class__.__name__, dataset=dataset, model=model)
        ],
    )
    self.constrain_pos = constrain_pos
    self.dataset = dataset.lower()

    # Pick the Morpheus attack variant matching the dataset's task
    if self.dataset == "mnli":
        self.attack = morpheus.MorpheusHuggingfaceNLI(model)
    elif "squad" in self.dataset:
        is_squad2 = "2" in self.dataset
        self.attack = morpheus.MorpheusHuggingfaceQA(model, squad2=is_squad2)
    elif self.dataset == "cnn_dailymail" or self.dataset == "xsum":
        rouge_type = kwargs.get("rouge_type", "rougeL")
        max_input_tokens = kwargs.get("max_input_tokens", 1024)
        self.attack = morpheus.MorpheusHuggingfaceSummarization(
            model, rouge_type=rouge_type, max_input_tokens=max_input_tokens
        )
    else:
        raise NotImplementedError
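# A minimal usage sketch, assuming the enclosing class is named Morpheus (the
# class name is not shown above) and that the model identifier below is
# available on the Hugging Face hub; both are illustrative assumptions.
attack = Morpheus(dataset="squad2", model="deepset/roberta-base-squad2")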
def __init__(self):
    super(HansNonEntQuotVerbs, self).__init__(
        phrase_groups=[
            ["hoped", "claimed", "thought", "believed", "said", "assumed"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    super(HasNegation, self).__init__(
        phrase_groups=[
            [
                "no",
                "not",
                "none",
                "no one",
                "nobody",
                "nothing",
                "neither",
                "nowhere",
                "never",
                "hardly",
                "scarcely",
                "barely",
                "doesnt",
                "isnt",
                "wasnt",
                "shouldnt",
                "wouldnt",
                "couldnt",
                "wont",
                "cant",
                "dont",
            ]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    super(HansQuestionEmbeddingVerbs, self).__init__(
        phrase_groups=[
            ["wondered", "understood", "knew", "asked", "explained", "realized"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(self):
    # Create a fake batch of data
    self.batch = {
        "text": [
            "The man is walking.",
            "The man is running.",
            "The woman is sprinting.",
            "The woman is resting.",
            "The hobbit is flying.",
            "The hobbit is swimming.",
        ],
        "label": [0, 0, 1, 1, 0, 0],
        "z": [1, 0, 1, 0, 1, 0],
        "fast": [False, True, True, False, False, False],
        "metadata": [
            {"source": "real"},
            {"source": "real"},
            {"source": "real"},
            {"source": "real"},
            {"source": "fictional"},
            {"source": "fictional"},
        ],
    }

    # Create a fake dataset
    self.dataset = Dataset.from_batch(
        self.batch,
        identifier=Identifier(_name="MockDataset", version="1.0"),
    )

    # Keep a copy of the original
    self.original_dataset = deepcopy(self.dataset)

    assert len(self.dataset) == 6
def __init__(self):
    super(HansAdverbs, self).__init__(
        phrase_groups=[
            ["quickly", "slowly", "happily", "easily", "quietly", "thoughtfully"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(
    self,
    dataset: Dataset,
):
    # Call the superclass
    super(DevBench, self).__init__()

    # An identifier for the DevBench
    self.identifier = Identifier("DevBench", dataset=str(dataset.identifier))

    # Dataset that the devbench operates on
    self._dataset = dataset

    # Create the collection of slices
    self._slices = set()
    self._slice_identifiers = set()
    self._slice_table = {}

    # The devbench has aggregators
    self.aggregators = {}

    # The devbench internally tracks metrics
    self.metrics = {}

    # Add slices if any
    self.add_slices(dataset)
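# A minimal usage sketch: wrap an existing Dataset in a DevBench (assumes
# `dataset` is a Robustness Gym Dataset like the mocks constructed above).
devbench = DevBench(dataset=dataset)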
def __init__(self):
    super(HansAdvsEntailed, self).__init__(
        phrase_groups=[
            ["certainly", "definitely", "clearly", "obviously", "suddenly"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(
    self,
    intervals: List[Tuple[int, int]],
    metric: Sequence[str] = ("rouge1", "fmeasure"),
    *args,
    **kwargs,
):
    assert len(metric) == 2, (
        "Must pass both a ROUGE score type (e.g. rouge1) and one of "
        "precision/recall/fmeasure."
    )
    super(RougeMatrixScoreSubpopulation, self).__init__(
        intervals=intervals,
        identifiers=[
            Identifier(
                _name=self.__class__.__name__,
                gte=interval[0],
                lte=interval[1],
                metric=metric,
            )
            for interval in intervals
        ],
        *args,
        **kwargs,
    )

    # Assign the metric
    self.metric = metric
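# A minimal usage sketch: slice examples whose ROUGE-1 f-measure falls in
# either interval (the interval endpoints are illustrative).
subpopulation = RougeMatrixScoreSubpopulation(
    intervals=[(0.0, 0.3), (0.7, 1.0)],
    metric=("rouge1", "fmeasure"),
)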
def __init__(
    self, phrases=None, identifiers: List[Identifier] = None, *args, **kwargs
):
    # This is the list of phrases that will be searched; default it before
    # building identifiers so that `phrases=None` does not raise
    self.phrases = phrases if phrases is not None else []

    super(HasPhrase, self).__init__(
        # One slice per phrase
        identifiers=[
            Identifier(_name=self.__class__.__name__, phrase=phrase)
            for phrase in self.phrases
        ]
        if not identifiers
        else identifiers,
        *args,
        **kwargs,
    )

    # Create and populate Aho-Corasick automatons for words and phrases
    self.word_ahocorasick = AhoCorasick.from_phrases(
        {i: phrase for i, phrase in enumerate(self.phrases) if " " not in phrase}
    )
    self.phrase_ahocorasick = AhoCorasick.from_phrases(
        {i: phrase for i, phrase in enumerate(self.phrases) if " " in phrase}
    )
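# A minimal usage sketch: single words ("walking") are routed to the word
# automaton and multi-word phrases ("ice cream") to the phrase automaton.
# The phrases are illustrative.
has_phrase = HasPhrase(phrases=["walking", "ice cream"])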
def __init__(self):
    super(HansNPZVerbs, self).__init__(
        phrase_groups=[
            ["hid", "moved", "presented", "paid", "studied", "stopped"]
        ],
        identifiers=[Identifier(_name=self.__class__.__name__)],
    )
def __init__(
    self,
    num_transformed=1,
    alpha_sr=0.1,
    alpha_ri=0.1,
    alpha_rs=0.1,
    p_rd=0.1,
):
    super(EasyDataAugmentation, self).__init__(
        identifiers=Identifier.range(
            n=num_transformed,
            _name=self.__class__.__name__,
            alpha_sr=alpha_sr,
            alpha_ri=alpha_ri,
            alpha_rs=alpha_rs,
            p_rd=p_rd,
        )
    )

    # Set the parameters
    self.alpha_sr = alpha_sr
    self.alpha_ri = alpha_ri
    self.alpha_rs = alpha_rs
    self.p_rd = p_rd

    # Download wordnet
    self._download_wordnet()
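# A minimal usage sketch: generate two EDA-transformed copies per example,
# with lighter synonym replacement and random deletion (values illustrative).
eda = EasyDataAugmentation(num_transformed=2, alpha_sr=0.05, p_rd=0.05)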
def __init__(self, *args, identifier: Identifier = None, **kwargs):
    if len(args) == 1 and isinstance(args[0], datasets.Dataset):
        # Create a Dataset directly from a datasets.Dataset object
        self.__dict__ = args[0].__dict__.copy()
    else:
        super(Dataset, self).__init__(*args, **kwargs)

    # Initialize the interaction tape mixin
    InteractionTapeHierarchyMixin.__init__(self)

    self.identifier = (
        Identifier(
            _name=self.info.builder_name,
            split=str(self.split),
            version=self.version,
        )
        if not identifier
        else identifier
    )

    # Keep track of the original dataset keys
    self.original_columns = list(self.features.keys())

    # Add an index to the dataset
    dataset = self.map(self.add_index, with_indices=True)
    self.__dict__.update(dataset.__dict__)

    # TODO(karan): fix the identifier settings for Dataset
    if self.identifier is not None and not str(self.identifier).startswith("None"):
        self.logdir /= str(self.identifier)
        self.logdir.mkdir(parents=True, exist_ok=True)
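# A minimal usage sketch, assuming the Hugging Face `datasets` library is
# installed; the dataset name and split are illustrative.
import datasets

boolq = Dataset(
    datasets.load_dataset("boolq", split="train"),
    identifier=Identifier(_name="boolq", split="train"),
)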