def _pump(self, target, smaller_size_array, a_is_smaller):
    """Run the Sub tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: np.ndarray
      The result of a-b.
    smaller_size_array: np.ndarray
      Either 'a' or 'b' depending on which has fewer elements.
    a_is_smaller: bool
      Whether or not 'a' is the smaller size array.

    Returns
    -------
    dict(
      a: np.ndarray
        The object to subtract something from.
      b: np.ndarray
        The object which subtracts from something else.
    )

    """
    # smaller_size_array is one of the two operands; the other one is
    # recovered by inverting target = a - b.
    saved = ut.maybe_copy(smaller_size_array)
    if not a_is_smaller:
        return {'a': np.array(target + smaller_size_array), 'b': saved}
    return {'a': saved, 'b': np.array(saved - target)}
def _pump(self, target, smaller_size_array, a_is_smaller, missing_vals):
    """Execute the Mul tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: np.ndarray
      The result of a*b
    smaller_size_array: np.ndarray
      Either 'a' or 'b' depending on which has fewer elements.
    a_is_smaller: bool
      Whether a is the smaller sized array.
    missing_vals: np.ndarray
      The values from either 'a' or 'b' that were lost when the other array
      had a zero in that location.

    Returns
    -------
    dict(
      a: np.ndarray
        The first array to be multiplied
      b: np.ndarray
        The second array to be multiplied
    )

    """
    # Find the value of the larger array using target and the smaller array.
    # Dividing by a zero yields inf/nan and would normally emit a warning;
    # every position where target == 0 is overwritten with missing_vals right
    # after the division, so the warnings are silenced.
    if a_is_smaller:
        a = ut.maybe_copy(smaller_size_array)
        with np.errstate(divide='ignore', invalid='ignore'):
            b = np.array(target / a)
        b[target == 0] = missing_vals
    else:
        with np.errstate(divide='ignore', invalid='ignore'):
            a = np.array(target / smaller_size_array)
        b = ut.maybe_copy(smaller_size_array)
        a[target == 0] = missing_vals
    return {'a': a, 'b': b}
def _pump(self, target, smaller_size_array, a_is_smaller):
    """Execute an addition tank (operation) in the pump (backward) direction.

    Recovers both operands of target = a + b from the sum and whichever
    operand was kept.

    Parameters
    ----------
    target: np.ndarray
      The result of a+b.
    smaller_size_array: np.ndarray
      Either 'a' or 'b' depending on which has fewer elements.
    a_is_smaller: bool
      Whether or not 'a' is the smaller size array.

    Returns
    -------
    dict(
      a: np.ndarray
        The first operand of the addition.
      b: np.ndarray
        The second operand of the addition.
    )

    """
    # Reconstruct the other array from the smaller size array and the target.
    if a_is_smaller:
        a = ut.maybe_copy(smaller_size_array)
        b = np.array(target - a)
    else:
        a = np.array(target - smaller_size_array)
        b = ut.maybe_copy(smaller_size_array)
    return {'a': a, 'b': b}
def _pour(self, a, b):
    """Run the Mul tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The first array to be multiplied
    b: np.ndarray
      The second array to be multiplied

    Returns
    -------
    dict(
      target: np.ndarray
        The result of a*b
      smaller_size_array: np.ndarray
        Either 'a' or 'b' depending on which has fewer elements.
      a_is_smaller: bool
        Whether a is the smaller sized array.
      missing_vals: np.ndarray
        The values of the larger array at the positions where the product
        is zero.
    )

    """
    # Anything that is not exactly an np.ndarray is converted to one.
    a, b = (x if type(x) is np.ndarray else np.array(x) for x in (a, b))

    # Only the operand with fewer elements is kept alongside the product.
    a_is_smaller = a.size < b.size
    smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

    # Wherever the product is zero the larger operand cannot be recovered by
    # division, so its values at those positions are stored explicitly.
    target = np.array(a * b)
    erased = target == 0
    larger = b if a_is_smaller else a
    missing_vals = larger[erased]

    return {
        'target': target,
        'smaller_size_array': smaller_size_array,
        'a_is_smaller': a_is_smaller,
        'missing_vals': missing_vals
    }
def _pump(self, target):
    """Pump (backward) direction: give every slot its own copy of target.

    Parameters
    ----------
    target: object
      The merged object.

    Returns
    -------
    dict(
      One entry per key in self.slot_keys, each holding
      ut.maybe_copy(target).
    )

    """
    return {key: ut.maybe_copy(target) for key in self.slot_keys}
def _pour(self, **kwargs):
    """Run the MergeEqual tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a0: object
      zeroth equal object
    a1: object
      first equal object,
    .
    .
    .

    Returns
    -------
    dict(
      target: object
        The merged object. Simply takes the value of 'a0'.
    )

    Raises
    ------
    ValueError
      If self.test_equal is set and some argument is not equal to 'a0'.

    """
    if self.test_equal:
        for value in kwargs.values():
            first = kwargs['a0']
            if np.all(value == first):
                continue
            raise ValueError(
                "All arguments passed to merge_equal must be equal. Got "
                + str(value) + ' and ' + str(first))
    return {'target': ut.maybe_copy(kwargs['a0'])}
def _pump(self, **kwargs):
    """Run the Clone tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    a0: object
      zeroth clone
    a1: object
      first clone,
    .
    .
    .

    Returns
    -------
    dict(
      a: object
        The value found in kwargs under the first key of self.slot_keys.
        The dict is empty when self.slot_keys is empty.
    )

    """
    # NOTE(review): the lookup uses self.slot_keys although the inputs are
    # documented as 'a0', 'a1', ... - confirm these key sets agree.
    for key in self.slot_keys:
        # Only the first key is ever consulted.
        return {'a': ut.maybe_copy(kwargs[key])}
    return {}
def _pour(self, a):
    """Run the Clone tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: object
      The object to be cloned.

    Returns
    -------
    dict(
      One entry per key in self.tube_keys (documented as 'a0', 'a1', ...),
      each holding ut.maybe_copy(a).
    )

    """
    return {key: ut.maybe_copy(a) for key in self.tube_keys}
def _pump(self, target, smaller_size_array, a_is_smaller, missing_vals, remainder):
    """Execute the Div tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: np.ndarray
      The result of a/b
    smaller_size_array: np.ndarray
      Either 'a' or 'b' depending on which has fewer elements.
    a_is_smaller: bool
      Whether a is the smaller sized array.
    missing_vals: np.ndarray
      The values from either 'a' or 'b' that were lost when the other array
      had a zero in that location.
    remainder: np.ndarray
      The remainder of a/b in the case that 'a' and 'b' are of integer type.

    Returns
    -------
    dict(
      a: np.ndarray
        The numerator array.
      b: np.ndarray
        The denominator array
    )

    """
    if a_is_smaller:
        # If a is the smaller of the two arrays, then it was the one that was
        # saved. So no need to worry about the remainder.
        a = ut.maybe_copy(smaller_size_array)
        # Positions where target == 0 divide by zero, but they are overwritten
        # with missing_vals immediately afterwards, so the floating point
        # warnings are silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            b = np.array(a / target)
        b[(target == 0)] = missing_vals
    else:
        # Wrapped in np.array so that zero dimensional operands still produce
        # an array that supports the masked assignment below.
        a = np.array(target * smaller_size_array)
        if target.dtype in (np.int32, np.int64):
            a = np.array(a + remainder)
        b = ut.maybe_copy(smaller_size_array)

        # If b is the smaller array then it is the one that was saved. This
        # means a nan, negative infinity, or positive infinity, (i.e. zeros in
        # b) correspond to the missing values in a.
        a[np.isposinf(target) | np.isneginf(target) | np.isnan(target)] = missing_vals
    return {'a': a, 'b': b}
def _pour(self, a, b):
    """Execute an addition tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The first operand of the addition.
    b: np.ndarray
      The second operand of the addition.

    Returns
    -------
    dict(
      target: np.ndarray
        The result of a+b.
      smaller_size_array: np.ndarray
        Either 'a' or 'b' depending on which has fewer elements.
      a_is_smaller: bool
        Whether or not 'a' is the smaller size array.
    )

    """
    # Convert to numpy arrays
    if type(a) is not np.ndarray:
        a = np.array(a)
    if type(b) is not np.ndarray:
        b = np.array(b)

    # Copy whichever has a fewer number of elements and pass as output
    a_is_smaller = a.size < b.size
    if a_is_smaller:
        smaller_size_array = ut.maybe_copy(a)
    else:
        smaller_size_array = ut.maybe_copy(b)

    target = np.array(a + b)
    return {
        'target': target,
        'smaller_size_array': smaller_size_array,
        'a_is_smaller': a_is_smaller
    }
def _pour(self, a, bs, selector):
    """Test each element of 'a' for membership in a selector-chosen 'bs' entry.

    For every distinct value u found in selector, the elements of 'a' at the
    positions where selector == u are tested with np.isin against bs[u].

    Parameters
    ----------
    a: np.ndarray
      The values to test.
    bs: indexable
      Indexed by the distinct values of selector; each entry is the
      collection to test against.
    selector: np.ndarray
      Same shape as 'a'. Picks, per element, which entry of bs is used.

    Returns
    -------
    dict(
      target: np.ndarray of bools
        Same shape as 'a', holding the membership results.
      a, bs, selector:
        The inputs passed through ut.maybe_copy.
    )

    Raises
    ------
    ValueError
      If the shapes of 'a' and selector differ.

    """
    if a.shape != selector.shape:
        raise ValueError(
            "Shape of a and selector must match. Got {} and {}".format(
                a.shape, selector.shape))

    target = np.zeros(a.shape, dtype=bool)
    for choice in np.unique(selector):
        chosen = selector == choice
        target[chosen] = np.isin(a[chosen], bs[choice])

    return {
        'target': target,
        'a': ut.maybe_copy(a),
        'bs': ut.maybe_copy(bs),
        'selector': ut.maybe_copy(selector)
    }
def _pour(self, a):
    """Run the Clone tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: object
      The object to be cloned into two.

    Returns
    -------
    dict(
      a: type of slot 'a'
        The first of the two cloned objects.
      b: type of slot 'a'
        The second of the two cloned objects.
    )

    """
    # ut.maybe_copy is called once per output.
    first = ut.maybe_copy(a)
    second = ut.maybe_copy(a)
    return dict(a=first, b=second)
def _pour(self, a, b):
    """Run the Sub tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The object to subtract something from.
    b: np.ndarray
      The object which subtracts from something else.

    Returns
    -------
    dict(
      target: np.ndarray
        The result of a-b.
      smaller_size_array: np.ndarray
        Either 'a' or 'b' depending on which has fewer elements.
      a_is_smaller: bool
        Whether or not 'a' is the smaller size array.
    )

    """
    # Anything that is not exactly an np.ndarray is converted to one.
    a, b = (x if type(x) is np.ndarray else np.array(x) for x in (a, b))

    # Keep whichever operand has fewer elements.
    a_is_smaller = a.size < b.size
    kept = ut.maybe_copy(a if a_is_smaller else b)

    return {
        'target': np.array(a - b),
        'smaller_size_array': kept,
        'a_is_smaller': a_is_smaller
    }
def _pour(self, a, axis):
    """Reduce 'a' along 'axis' with np_func.

    np_func is not defined in this function; presumably it is bound by an
    enclosing scope - confirm against the surrounding code.

    Parameters
    ----------
    a: np.ndarray
      The array to reduce.
    axis: int, tuple of ints or empty tuple
      The axis or axes to reduce over. Anything whose array form has no
      elements (e.g. an empty tuple) is passed to np_func as axis=None.

    Returns
    -------
    dict(
      target:
        The value returned by np_func(a, axis=...).
      a:
        'a' passed through ut.maybe_copy.
      axis: np.ndarray
        The axis argument converted with np.array.
    )

    """
    axis_array = np.array(axis)
    # An axis with no elements means "no axis given".
    input_axis = axis if axis_array.size else None
    target = np_func(a, axis=input_axis)
    return {'target': target, 'a': ut.maybe_copy(a), 'axis': axis_array}
def _pour(self, a, mask, replace_with):
    """Run the Replace tank (operation) in the pour (forward) direction.

    Also stores mask on the instance as self.mask. The rank and size of
    replace_with are not validated against the mask here.

    Parameters
    ----------
    a: np.ndarray
      The array which has values that are to be replaced.
    mask: np.ndarray of bools
      An array of booleans whose True values denote which of array 'a's
      values are to be replaced.
    replace_with: np.ndarray
      The values to be used to replace the corresponding values in 'a'.

    Returns
    -------
    dict(
      target: np.ndarray of same type as 'a'
        The array with the necessary values replaced.
      mask: np.ndarray of bools
        The mask, unchanged.
      replaced_vals: np.ndarray
        Built with af.empty_array_like(a); holds, at the masked positions,
        the values that were overwritten.
      replace_with: np.ndarray
        The replace_with argument converted with np.array.
    )

    """
    self.mask = mask

    new_values = np.array(replace_with)
    target = ut.maybe_copy(a)

    # Remember what is about to be overwritten, then overwrite it.
    overwritten = af.empty_array_like(a)
    overwritten[mask] = target[mask]
    target[mask] = new_values

    return dict(
        target=target,
        mask=mask,
        replaced_vals=overwritten,
        replace_with=new_values,
    )
def _pump(self, target, removed, num_tries, ends, random_seed, segment_ids, is_random_next):
    """Pump (backward) direction: undo the overwrites and strip the tags.

    Parameters
    ----------
    target: np.ndarray
      The array a with the [SEP] and [CLS] tags as well as some randomly
      overwritten second sentences.
    removed: np.ndarray
      An array with the same size as target that contains all the substrings
      that were overwritten. Entries equal to '[NA]' mark positions that
      were not overwritten.
    num_tries: int
      The number of times to try and find a random sentence to replace the
      second part of the 'a' array.
    ends: np.ndarray of bools
      An array of the same shape as 'a' which marks the end of a sentence
      with a True.
    random_seed: int
      The random seed.
    segment_ids: np.ndarray
      Not used by this method.
    is_random_next: np.ndarray
      Not used by this method.

    Returns
    -------
    dict(
      a: np.ndarray
        The restored array without '[CLS]' and '[SEP]' entries. Its last
        dimension is three shorter than that of target.
      num_tries: int
        Passed through unchanged.
      ends: np.ndarray of bools
        Passed through unchanged.
      random_seed: int
        Passed through unchanged.
    )

    """
    # Put back everything that was overwritten.
    restored = ut.maybe_copy(target)
    overwritten = removed != '[NA]'
    restored[overwritten] = removed[overwritten]

    # Drop the tags. The reshape assumes exactly three tags per row of the
    # last dimension - presumably one [CLS] and two [SEP]; confirm.
    is_tag = np.isin(restored, ['[CLS]', '[SEP]'])
    without_tags = restored[~is_tag]
    new_shape = list(target.shape[:-1]) + [target.shape[-1] - 3]
    a = np.reshape(without_tags, new_shape)

    return {
        'a': a,
        'num_tries': num_tries,
        'ends': ends,
        'random_seed': random_seed
    }
def _pump(self, target, mask, replaced_vals, replace_with_shape):
    """Run the Replace tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: np.ndarray of same type as 'a'
      The array with the necessary values replaced.
    mask: np.ndarray of bools
      An array of booleans whose True values denote which of array 'a's
      values are to be replaced.
    replaced_vals: np.ndarray of same type as 'a'
      The values that were overwritten when they were replaced by the
      replace_with values.
    replace_with_shape: list of ints
      The original shape of the replace_with array. When the mask is all
      False, its first element is used as the replace_with value itself.

    Returns
    -------
    dict(
      a: np.ndarray
        The array which has values that are to be replaced.
      mask: np.ndarray of bools
        The mask, unchanged.
      replace_with: np.ndarray
        The values to be used to replace the corresponding values in 'a'.
    )

    """
    a = ut.maybe_copy(target)

    # Read the replacement values out of the target before restoring 'a'.
    replace_with = a[mask]
    a[mask] = replaced_vals[mask]

    if not mask.any():
        # Nothing was replaced, so the values cannot be read from the target.
        # In this case replace_with_shape carries the value itself.
        replace_with = replace_with_shape[0]
    else:
        # An empty shape denotes a scalar, which has one element.
        num_elements = np.prod(replace_with_shape) if replace_with_shape else 1
        # A single element is restored from the first replaced value, reshaped
        # to its former shape.
        if num_elements == 1:
            replace_with = replace_with.flatten()[0].reshape(replace_with_shape)

    a = a.astype(replaced_vals.dtype.type)
    return {'a': a, 'mask': mask, 'replace_with': replace_with}
def _pour(self, a, mask, replace_with):
    """Run the Replace tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The array which has values that are to be replaced.
    mask: np.ndarray of bools
      An array of booleans whose True values denote which of array 'a's
      values are to be replaced.
    replace_with: np.ndarray
      The values to be used to replace the corresponding values in 'a'.

    Returns
    -------
    dict(
      target: np.ndarray of same type as 'a'
        The array with the necessary values replaced.
      mask: np.ndarray of bools
        The mask, unchanged.
      replaced_vals: np.ndarray
        Built with af.empty_array_like(a); holds, at the masked positions,
        the values that were overwritten.
      replace_with_shape: tuple
        The shape of replace_with when the mask has any True value,
        otherwise a one element tuple holding replace_with itself.
    )

    """
    new_values = np.array(replace_with)
    target = ut.maybe_copy(a)

    # Remember what is about to be overwritten, then overwrite it.
    overwritten = af.empty_array_like(a)
    overwritten[mask] = target[mask]
    target[mask] = new_values

    # With an all False mask the values leave no trace in the target, so
    # they are stored in place of the shape.
    shape_or_values = new_values.shape if mask.any() else (new_values,)

    return dict(
        target=target,
        mask=mask,
        replaced_vals=overwritten,
        replace_with_shape=shape_or_values,
    )
def _pour(self, a):
    """Run the Shape tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The array to get the shape of

    Returns
    -------
    dict(
      target: list of ints
        The shape of the array.
      a: np.ndarray
        'a' passed through ut.maybe_copy.
    )

    """
    shape = list(a.shape)
    return dict(target=shape, a=ut.maybe_copy(a))
def _pump(self, target, a):
    """Run the Shape tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: list of ints
      The shape of the array. Not used by this method.
    a: np.ndarray
      The array to get the shape of

    Returns
    -------
    dict(
      a: np.ndarray
        'a' passed through ut.maybe_copy.
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored)
def _pour(self, a, default_val):
    """Compute the effective length of the last dimension of 'a'.

    The effective length is the position just after the last element that
    differs from default_val, i.e. the length once trailing default_val
    entries are ignored.

    Parameters
    ----------
    a: np.ndarray
      The array to get the effective length of.
    default_val:
      The value to not count

    Returns
    -------
    dict(
      target: np.ndarray
        An array of the same shape as 'a' except missing the last dimension.
        The values are effective lengths of the last dimension of a.
      a: np.ndarray
        'a' passed through ut.maybe_copy.
      default_val:
        The value to not count
    )

    """
    # Work on an array version of 'a' throughout so that the shape lookup
    # below does not depend on 'a' already being an np.ndarray.
    arr = np.array(a)
    is_default = arr == default_val
    all_default = np.all(is_default, axis=-1)

    # argmax over the reversed last axis gives the index of the first non
    # default entry when scanning from the end, i.e. the number of trailing
    # default entries.
    trailing = np.argmax(~is_default[..., ::-1], axis=-1)

    # Rows consisting only of default_val have length zero. np.where is used
    # instead of an in-place masked assignment because the reduction yields a
    # scalar for one dimensional input, which cannot be assigned into.
    lengths = np.where(all_default, 0, arr.shape[-1] - trailing)

    return {
        'target': lengths,
        'a': ut.maybe_copy(a),
        'default_val': default_val
    }
def _pump(self, target, a, axis):
    """Pump (backward) direction of the dimension size lookup.

    Parameters
    ----------
    target:
      The size of 'a' along 'axis'. Not used by this method.
    a: np.ndarray
      The array the dimension size was read from.
    axis: int
      The axis to get the dim_size from.

    Returns
    -------
    dict(
      a: np.ndarray
        'a' passed through ut.maybe_copy.
      axis: int
        The axis, unchanged.
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored, axis=axis)
def _pour(self, a, axis):
    """Pour (forward) direction: look up the size of 'a' along 'axis'.

    Parameters
    ----------
    a: np.ndarray
      The array to read the dimension size from.
    axis: int
      The axis to get the dim_size from.

    Returns
    -------
    dict(
      target:
        a.shape[axis]
      a: np.ndarray
        'a' passed through ut.maybe_copy.
      axis: int
        The axis, unchanged.
    )

    """
    dim_size = a.shape[axis]
    return dict(target=dim_size, a=ut.maybe_copy(a), axis=axis)
def _pump(self, target, a, key):
    """Pump (backward) direction of the getitem lookup.

    Parameters
    ----------
    target: object
      The value returned from the __getitem__ call to 'a'. Not used by this
      method.
    a: object
      The object to getitem from.
    key: hashable
      The key to pass to the getitem

    Returns
    -------
    dict(
      a: object
        'a' passed through ut.maybe_copy.
      key: hashable
        The key, unchanged.
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored, key=key)
def _pour(self, a, key):
    """Pour (forward) direction: look up a[key].

    Parameters
    ----------
    a: object
      The object to getitem from.
    key: hashable
      The key to pass to the getitem

    Returns
    -------
    dict(
      target: object
        The value returned from the __getitem__ call to 'a'.
      a: object
        'a' passed through ut.maybe_copy.
      key: hashable
        The key to pass to the getitem
    )

    """
    # 'key' is part of the documented output and has to be handed on so the
    # lookup can be reproduced in the backward direction.
    return {'target': a[key], 'a': ut.maybe_copy(a), 'key': key}
def _pump(self, target, mask, replaced_vals, replace_with):
    """Run the Replace tank (operation) in the pump (backward) direction.

    Parameters
    ----------
    target: np.ndarray of same type as 'a'
      The array with the necessary values replaced.
    mask: np.ndarray of bools
      An array of booleans whose True values denote which of array 'a's
      values are to be replaced.
    replaced_vals: np.ndarray of same type as 'a'
      The values that were overwritten when they were replaced by the
      replace_with values.
    replace_with: np.ndarray
      The values that were used as replacements. Returned unchanged.

    Returns
    -------
    dict(
      a: np.ndarray
        The array with the original values restored at the masked positions.
      mask: np.ndarray of bools
        The mask, unchanged.
      replace_with: np.ndarray
        The replace_with argument, unchanged.
    )

    """
    a = ut.maybe_copy(target)
    replaced_vals = np.array(replaced_vals)

    # Switch to the dtype of replaced_vals first when its items are larger,
    # before the values are written back.
    if replaced_vals.dtype.itemsize > a.dtype.itemsize:
        a = a.astype(replaced_vals.dtype)

    # A single value is written to every masked position; otherwise the
    # values are taken position by position.
    originals = replaced_vals if replaced_vals.size == 1 else replaced_vals[mask]
    a[mask] = originals

    a = a.astype(replaced_vals.dtype.type)
    return {'a': a, 'mask': mask, 'replace_with': replace_with}
def _pump(self, target, a, default_val):
    """Pump (backward) direction of the effective length computation.

    Parameters
    ----------
    target: np.ndarray
      An array of the same shape as 'a' except missing the last dimension.
      The values are effective lengths of the last dimension of a. Not used
      by this method.
    a: np.ndarray
      The array to get the effective length of.
    default_val:
      The value to not count

    Returns
    -------
    dict(
      a: np.ndarray
        'a' passed through ut.maybe_copy.
      default_val:
        The value to not count
    )

    """
    restored = ut.maybe_copy(a)
    return dict(a=restored, default_val=default_val)
def _pour(self, strings, tokenizer, max_len, detokenizer):
    """Execute the Tokenize tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    strings: np.ndarray of strings
      The array of strings to tokenize.
    tokenizer: func
      Function which converts a string into a list of strings.
    max_len: int
      The maximum number of tokens. Defines the size of the added dimension.
    detokenizer: func
      Function which takes in a list of tokens and returns a string. Not
      strictly necessary but it makes the tube 'diff' much smaller if it's
      close to the real method of detokenizing.

    Returns
    -------
    dict(
      target: np.ndarray
        The array of tokenized strings. Will have rank = rank('a') + 1 where
        the last dimension will have size max_len.
      diff: np.ndarray of strings
        The array of strings which define the differences between the
        original string and the string that has been tokenized then
        detokenized.
      tokenizer: func
        The tokenizer, unchanged.
      detokenizer: func
        The detokenizer, unchanged.
    )

    """
    # Convert to a numpy array.
    strings = np.array(strings)

    # Handle the empty array case
    if not strings.size:
        return {
            'target': ut.maybe_copy(strings),
            'diff': ut.maybe_copy(strings),
            'tokenizer': tokenizer,
            'detokenizer': detokenizer
        }

    all_tokens = []
    all_diffs = []
    for string in strings.flatten():
        # Tokenize the string, and regularize the length of the array by
        # padding with '' to fill out the array if it's too small or
        # truncating it if it's too long.
        tokens = np.array(tokenizer(string))
        if tokens.size < max_len:
            num = max_len - tokens.size
            tokens = np.concatenate([tokens, np.full([num], '')])
        else:
            tokens = tokens[:max_len]
        all_tokens.append(tokens)

        # Detokenize the (padded or truncated) tokens and record the diff
        # between the result and the original string.
        processed = detokenizer(tokens)
        diff = di.get_diff_string(processed, string)
        # np.str_ replaces the np.unicode alias, which no longer exists in
        # current NumPy versions.
        all_diffs.append(np.array(diff, dtype=np.str_))

    # Combine all the tokens arrays into a single array and reshape to the
    # shape of the original strings array with an additional dimension of
    # size max_len.
    token_array = np.stack(all_tokens)
    target = np.reshape(token_array, list(strings.shape) + [max_len])

    # Keep all the string diffs and reshape it to match the original strings
    # array shape.
    diff_array = np.stack(all_diffs)
    diff = np.reshape(diff_array, strings.shape)

    return {
        'target': target,
        'diff': diff,
        'tokenizer': tokenizer,
        'detokenizer': detokenizer
    }
def _pour(self, a, b):
    """Run the Div tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    a: np.ndarray
      The numerator array.
    b: np.ndarray
      The denominator array

    Returns
    -------
    dict(
      target: np.ndarray
        The result of a/b
      smaller_size_array: np.ndarray
        Either 'a' or 'b' depending on which has fewer elements.
      a_is_smaller: bool
        Whether a is the smaller sized array.
      missing_vals: np.ndarray
        The values of the larger array that cannot be recovered from the
        quotient and the smaller array.
      remainder: np.ndarray
        The remainder of a/b in the case that 'a' and 'b' are of integer
        type, otherwise an empty array.
    )

    Raises
    ------
    ZeroDivisionError
      If both arrays are of dtype int32/int64 and 'b' contains a zero.

    """
    # Anything that is not exactly an np.ndarray is converted to one.
    a, b = (x if type(x) is np.ndarray else np.array(x) for x in (a, b))

    # Keep whichever operand has fewer elements.
    a_is_smaller = a.size < b.size
    smaller_size_array = ut.maybe_copy(a if a_is_smaller else b)

    target = np.array(a / b)

    # Store the values of the larger array at the positions where the
    # quotient carries no information about them.
    if a_is_smaller:
        missing_vals = b[(target == 0)]
    else:
        undefined = np.isposinf(target) | np.isneginf(target) | np.isnan(target)
        missing_vals = a[undefined]

    int_types = (np.int32, np.int64)
    if not (a.dtype in int_types and b.dtype in int_types):
        remainder = np.array([], dtype=target.dtype)
    else:
        # Integer division by zero is not allowed.
        if (b == 0).any():
            raise ZeroDivisionError(
                "Integer division by zero is not supported.")
        remainder = np.array(np.remainder(a, b))

    return {
        'target': target,
        'smaller_size_array': smaller_size_array,
        'a_is_smaller': a_is_smaller,
        'missing_vals': missing_vals,
        'remainder': remainder
    }
def _pour(self, strings, ids, tokenizer, detokenizer=lambda a: ' '.join(a)):
    """Execute the FlatTokenize tank (operation) in the pour (forward) direction.

    Parameters
    ----------
    strings: np.ndarray of strings
      The array of strings to tokenize.
    ids: np.ndarray
      An array of ids which uniquely identify each element of 'strings'.
      Necessary in order to reconstruct strings since all information about
      axis is lost when flattened. Each id from ids must be unique. The array
      is the same shape as strings.
    tokenizer: func
      Function which converts a string into a list of strings.
    detokenizer: func
      Function which takes in a list of tokens and returns a string. Not
      strictly necessary but it makes the tube 'diff' much smaller if it's
      close to the real method of detokenizing. Defaults to joining the
      tokens with single spaces.

    Returns
    -------
    dict(
      target: np.ndarray
        A one dimensional array of tokens.
      diff: np.ndarray of strings
        The diff between each original string and its tokenized then
        detokenized form, repeated once per token of that string.
      tokenizer: func
        The tokenizer, unchanged.
      detokenizer: func
        The detokenizer, unchanged.
      ids: np.ndarray
        The id of the originating string, repeated once per token, so it has
        the same length as target.
      shape: tuple of ints
        The shape of the inputted array.
    )

    For empty input, target and diff keep the shape of 'strings' instead of
    being flattened, and ids is the input ids passed through ut.maybe_copy.

    """
    strings = np.array(strings)

    # Guard for the empty array case. The same keys as in the regular case
    # are returned so that consumers of 'ids' and 'shape' keep working.
    if not strings.size:
        return {
            'target': ut.maybe_copy(strings),
            'diff': ut.maybe_copy(strings),
            'tokenizer': tokenizer,
            'detokenizer': detokenizer,
            'ids': ut.maybe_copy(ids),
            'shape': strings.shape
        }

    all_tokens = []
    all_diffs = []
    r_ids = []

    # Go through each element of the string array, and its corresponding id.
    for string_id, string in zip(ids.flatten(), strings.flatten()):
        # Tokenize the string and add it to the long list of all the tokens.
        tokens = tokenizer(string)
        all_tokens.extend(tokens)

        # Copy the string id len(tokens) times so that the ids always have
        # the same length as the tokens. This makes it more suitable for
        # breaking up in downstream tanks.
        r_ids.extend([string_id] * len(tokens))

        # Find the string diff after detokenizing the tokens.
        processed = detokenizer(tokens)
        diff = di.get_diff_string(processed, string)

        # Copy the diff len(tokens) times so that it always has the same size
        # as tokens. This makes it more suitable for breaking up in
        # downstream tanks.
        all_diffs.extend([diff] * len(tokens))

    target = np.array(all_tokens).astype(strings.dtype)
    diff = np.array(all_diffs).astype(strings.dtype)
    r_ids = np.array(r_ids)

    return {
        'target': target,
        'diff': diff,
        'tokenizer': tokenizer,
        'detokenizer': detokenizer,
        'ids': r_ids,
        'shape': strings.shape
    }