def compute(seq_x, seq_y, order=2, verbose=True, truncate=True):
    """
    Estimate the joint (2D) Effort-To-Compress for a pair of sequences.

    Thin dispatcher: one of four backend routines is selected from the
    "verbose" and "truncate" flags and its result is packaged in a dictionary.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers, must have the same length as seq_x.
    order : int, optional
        Number of elements in window for substitution. The default is 2 for pairs.
    verbose : bool, optional
        Whether to also return the trajectory produced by the backend.
        The default is True.
    truncate : bool, optional
        Use the "truncated" backends when True, the "full" backends otherwise.
        The default is True.

    Returns
    -------
    dict
        "ETC2D" (raw estimate), "NETC2D" (estimate divided by len(seq_x) - 1)
        and, only when verbose=True, "Trajectory".

    Raises
    ------
    AssertionError
        If the two sequences differ in length.

    Notes
    -----
    The normalisation divides by len(seq_x) - 1 after conversion, so a
    converted sequence of length 1 leads to a ZeroDivisionError.

    """
    assert len(seq_x) == len(
        seq_y), "ERROR: The 2 sequences should have the same length!"

    # Convert both inputs with cast() before handing them to a backend
    seq_x = cast(seq_x)
    seq_y = cast(seq_y)

    if verbose:
        # Verbose backends return the estimate together with a trajectory
        backend = _compute_verbose_truncated if truncate else _compute_verbose_full
        etc, trajectory = backend(seq_x, seq_y, order)
        return {
            "ETC2D": etc,
            "NETC2D": etc / (len(seq_x) - 1),
            "Trajectory": trajectory,
        }

    # Compact backends return the estimate only
    backend = _compute_compact_truncated if truncate else _compute_compact_full
    etc = backend(seq_x, seq_y, order)
    return {"ETC2D": etc, "NETC2D": etc / (len(seq_x) - 1)}
def generate(size=10, partitions=2, seed=None):
    """
    Generate discrete random data of the desired size and number of bins.

    Parameters
    ----------
    size : int, optional
        Length of sequence to generate. The default is 10.
    partitions : int, optional
        Number of bins/partitions to create, 2 or greater. Symbols are drawn
        from 1, 2, ..., partitions. The default is 2.
    seed : int, optional
        Seed value for initializing the random number generator. Any value
        other than None is used, including 0. The default is None, which
        leaves the generator state untouched.

    Returns
    -------
    array-like or None
        Result of recode.cast() applied to the sampled integers, or None
        (after printing a diagnostic) when "size" or "partitions" is not an
        int or "partitions" is smaller than 2.

    """
    inputs_ok = (
        isinstance(partitions, int) and isinstance(size, int) and partitions >= 2
    )
    if not inputs_ok:
        print(partitions, size)
        print(">> Number of bins is invalid ...")
        return None

    # Compare against None rather than relying on truthiness: 0 is a
    # legitimate seed and must not be silently ignored.
    if seed is not None:
        seedvalue(seed)

    return recode.cast(choices(range(1, partitions + 1), k=size))
def compute_save(seq, filename, order=2, truncate=True):
    """
    Estimate the Effort-To-Compress of a sequence and store its trajectory.

    Always runs a verbose backend, hands the resulting trajectory to save()
    together with "filename", and returns the summary values.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    filename : str or Path object
        Name of output file or path to output file, forwarded to save().
    order : int, optional
        Number of elements in window for substitution. The default is 2 for pairs.
    truncate : bool, optional
        Use the "truncated" verbose backend when True, the "full" one
        otherwise. The default is True.

    Returns
    -------
    dict
        "ETC1D" (raw estimate) and "NETC1D" (estimate divided by len(seq) - 1).

    """
    # Convert the input with cast() before handing it to a backend
    seq = cast(seq)

    backend = _compute_verbose_truncated if truncate else _compute_verbose_full
    etc, trajectory = backend(seq, order)

    # Persist the trajectory (CSV output, per the original author's note)
    save(trajectory, filename)

    return {"ETC1D": etc, "NETC1D": etc / (len(seq) - 1)}
def compute(seq, order=2, verbose=False, truncate=True):
    """
    Estimate the Effort-To-Compress (1D) for a given sequence.

    Thin dispatcher: one of four backend routines is selected from the
    "verbose" and "truncate" flags and its result is packaged in a dictionary.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    order : int, optional
        Number of elements in window for substitution. The default is 2 for pairs.
    verbose : bool, optional
        Whether to also return the trajectory produced by the backend.
        The default is False.
    truncate : bool, optional
        Use the "truncated" backends when True, the "full" backends otherwise.
        The default is True.

    Returns
    -------
    dict
        "ETC1D" (raw estimate), "NETC1D" (estimate divided by len(seq) - 1)
        and, only when verbose=True, "Trajectory".

    """
    # Convert the input with cast() before handing it to a backend
    seq = cast(seq)

    if verbose:
        # Verbose backends return the estimate together with a trajectory
        backend = _compute_verbose_truncated if truncate else _compute_verbose_full
        etc, trajectory = backend(seq, order)
        return {
            "ETC1D": etc,
            "NETC1D": etc / (len(seq) - 1),
            "Trajectory": trajectory,
        }

    # Compact backends return the estimate only
    backend = _compute_compact_truncated if truncate else _compute_compact_full
    etc = backend(seq, order)
    return {"ETC1D": etc, "NETC1D": etc / (len(seq) - 1)}
def compute_save(seq_x, seq_y, filename, truncate=True, order=2):
    """
    Estimate the joint (2D) Effort-To-Compress and store its trajectory.

    Always runs a verbose backend, hands the resulting trajectory to save()
    together with "filename", and returns the summary values.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers, must have the same length as seq_x.
    filename : str or Path object
        Name of output file or path to output file, forwarded to save().
    truncate : bool, optional
        Use the "truncated" verbose backend when True, the "full" one
        otherwise. The default is True.
    order : int, optional
        Number of elements in window for substitution. The default is 2 for pairs.

    Returns
    -------
    dict
        "ETC2D" (raw estimate) and "NETC2D" (estimate divided by
        len(seq_x) - 1).

    Raises
    ------
    AssertionError
        If the two sequences differ in length.

    """
    assert len(seq_x) == len(
        seq_y), "ERROR: The 2 sequences should have the same length!"

    # Convert both inputs with cast() before handing them to a backend
    seq_x = cast(seq_x)
    seq_y = cast(seq_y)

    backend = _compute_verbose_truncated if truncate else _compute_verbose_full
    etc, trajectory = backend(seq_x, seq_y, order)

    # Persist the trajectory (CSV output, per the original author's note)
    save(trajectory, filename)

    return {"ETC2D": etc, "NETC2D": etc / (len(seq_x) - 1)}
def compute_complexity(seq):
    """
    Compute the Lempel-Ziv complexity of a sequence via core.lzc_a().

    Parameters
    ----------
    seq : array-like
        Input sequence. If arraytype() does not accept it, it is first
        converted with cast().

    Returns
    -------
    int or None
        None when the converted input is None; 2 when core.check_equality()
        reports that all elements are equal (a notice is printed); otherwise
        whatever core.lzc_a() returns for the sequence.

    """
    # Convert anything that is not already an accepted array type
    if not arraytype(seq):
        seq = cast(seq)

    # Nothing usable to work with
    if seq is None:
        return None

    # Constant sequences are special-cased (LZ76 of such inputs is 2)
    all_equal = core.check_equality(seq)
    if all_equal:
        print("> All elements in sequence are equal!")
        return 2

    # General case: delegate to the compiled routine
    return core.lzc_a(seq)
def test_cast_valid(x):
    """A valid input must come back from recode.cast() as an array of typecode "I"."""
    converted = recode.cast(x)
    assert isinstance(converted, array)
    assert converted.typecode == "I"
def test_cast_zeroes():
    """A sequence consisting only of zeroes must be rejected by recode.cast()."""
    assert recode.cast([0, 0, 0, 0]) is None
def test_cast_invalid(x):
    """An invalid input must make recode.cast() return None."""
    assert recode.cast(x) is None
def onestep(seq, order, verbose=True, check=True):
    """
    Run a single NSRWS step on a sequence after validating the inputs.

    Public wrapper around _onestep(): converts the input if needed, optionally
    rejects constant sequences, rejects sequences shorter than the window and
    only then delegates the actual work.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers. Inputs
        not accepted by arraytype() are converted with cast().
    order : int
        Size of window for NSRWS, 2 or greater.
    verbose : bool, optional
        Forwarded to _onestep(); whether to report extra details (frequent
        window substituted, its count & time taken). The default is True.
    check : bool, optional
        Check for equality of all symbols in sequence. The default is True.

    Returns
    -------
    tuple or None
        None when the input cannot be converted, when "check" is set and all
        symbols are equal, or when the sequence is shorter than "order"
        (a notice is printed in the latter two cases).

        Otherwise the result of _onestep(), described by the original author as:

        array.array
            Sequence with the most frequently occurring non-sequentially
            overlapping window substituted.
        bool
            Indicator for the state of sequence with all distinct pairs (count=1).

        plus, depending on verbosity:

        array.array
            Frequent window substituted.
        int
            Number of times the frequent window occurred in the sequence.
        float
            Time taken to execute step.

    """
    # Convert anything that is not already an accepted array type
    if not arraytype(seq):
        seq = cast(seq)

    # Conversion failed: nothing to do
    if seq is None:
        return None

    # Optional guard against constant sequences
    if check and core.check_equality(seq):
        print("> All elements in sequence are equal!")
        return None

    # A window cannot be larger than the sequence itself
    if len(seq) < order:
        print(
            "> Sequence input shorter than order!\n> Can't perform substitution ..."
        )
        return None

    # All checks passed: perform one step of NSRWS
    return _onestep(seq, order, verbose)
def _onestep_windows(seq, order, verbose=True):
    """
    Execute one full step of NSRWS with order>=2 for a given sequence.

    The step consists of three stages:
        1. core.get_mask_windows() builds a mask marking overlapping windows
        2. _mask_and_count() applies the mask and finds the most frequent window
        3. core.substitute_windows() replaces every occurrence of that window

    When every window is distinct (count of the most frequent one is 1), only
    the first window is substituted: the leading order-1 symbols are dropped
    and the new first symbol is set to the substitution value.

    Author notes carried over from the original documentation:
        - Compared with _onestep_pairs this variant is slower (more nested
          loops and checks) but handles arbitrary window orders.
        - For higher window orders, correctness needs to be proved outside of
          tests.
        - Possible improvements: decorators for timing and for verbosity, and a
          Cython implementation of the slowest part, _mask_and_count.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    order : int
        Size of window for NSRWS, 2 or greater.
    verbose : bool, optional
        Whether to report extra details. These include the frequent window
        that was substituted, its counts & total time taken. The default is True.

    Returns
    -------
    tuple, of the following fixed elements:

        out : array.array
            Sequence after substitution, passed through cast().
        signal : bool
            True when all windows were distinct (count=1), False otherwise.

    optional elements of tuple, present only if verbose=True:

        freq_window : array.array
            Frequent window substituted.
        count : int
            Number of times the frequent window occurred in the sequence.
        time_taken : float
            Time taken to execute step, measured with perf_counter().

    """
    # Start timing the step
    started = perf_counter()

    # Stage 1 & 2: mask out overlapping windows, then find the most frequent one
    window_mask = core.get_mask_windows(seq, order)
    freq_window, count = _mask_and_count(seq, window_mask, order)

    # The replacement symbol is one larger than any symbol currently in use
    sub_value = 1 + max(seq)

    # Signal marks the state in which every window occurs exactly once
    signal = count == 1

    if signal:
        # Substitute only the first window: drop order-1 symbols, overwrite the next
        out = cast(seq[order - 1:])
        out[0] = sub_value
    else:
        # Stage 3: substitute all instances of the frequent window
        out = cast(core.substitute_windows(seq, order, freq_window, sub_value))

    # Stop timing
    elapsed = perf_counter() - started

    # Bare essentials unless more was asked for
    if not verbose:
        return out, signal

    return out, signal, freq_window, count, elapsed
def _mask_and_count(seq, mask, order):
    """
    Apply a binary mask to sliding windows of a sequence and find the most
    frequently occurring window.

    Three things happen in sequence:
        1. Sliding windows of size "order" are built - using zip and islice
        2. The supplied mask is applied to those windows - using compress
        3. The most frequent remaining window is picked - using Counter

    According to the original author this is the most time consuming step of
    the NSRWS algorithm.

    Conceptually the 1D sequence is expanded row-wise into its sliding windows:

        1D sequence: (1,2,3,4,5,6,7)
        windows for order=3:
            ((1,2,3),
             (2,3,4),
             (3,4,5),
             (4,5,6),
             (5,6,7))

    The mask has one entry per window, i.e.
        len(mask) = len(seq) - (order - 1)

    Applying the mask (1,0,0,1,1) keeps only the rows flagged with 1:

        1 -> (1,2,3)   kept
        0 -> (2,3,4)   dropped
        0 -> (3,4,5)   dropped
        1 -> (4,5,6)   kept
        1 -> (5,6,7)   kept

    Example with overlap:

        1D sequence: (1,1,1,1,1,2,1)
        windows for order=3:
            ((1,1,1),
             (1,1,1),  ----> overlap
             (1,1,1),  ----> overlap
             (1,1,2),
             (1,2,1))

        The mask will be (1,0,0,1,1), leaving ((1,1,1), (1,1,2), (1,2,1)).
        Each window occurs once and the first one is returned -> (1,1,1)

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    mask : array.array
        Collection of Booleans, where 0s indicate windows of "seq" to mask
        out. 0s correspond to overlapping windows.
    order : int
        Size of window for NSRWS, 2 or greater.

    Returns
    -------
    freq_window : array.array
        Most frequently occurring unmasked window of size "order", passed
        through cast().
    count : int
        Number of times the most frequently occurring window occurs.

    Raises
    ------
    IndexError
        If no window survives the mask (nothing to count).

    """
    # One shifted view of the sequence per position inside the window
    shifted_views = (islice(seq, offset, None) for offset in range(order))

    # Zipping the shifted views yields the sliding windows as tuples
    windows = zip(*shifted_views)

    # Keep only the windows the mask lets through
    unmasked = compress(windows, mask)

    # Tally the windows (tuples are hashable!) and take the single most common
    tally = Counter(unmasked)
    top_window, count = tally.most_common(1)[0]

    # Convert the winning tuple to the array type and return
    return cast(top_window), count
def compute(seq_x, seq_y, LEN_past, ADD_meas, STEP_size, n_partitions=False):
    """
    Estimate the Compression-Complexity based Causality for two sequences.

    The direction of causality being assessed is from seq_y -> seq_x. Various other
    parameters need to be specified, a brief description is offered below. For detailed
    explanations regarding the parameters, interpretations and of the inner workings,
    please refer to the research article along with the supplementary:

        Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for
        Complexity-Causality Measure.” PeerJ Computer Science 5 (May 2019): e196.
        https://doi.org/10.7717/peerj-cs.196.

    Parameters
    ----------
    seq_x : list or tuple
        Sequence of numbers, if not integers specify n_partitions for binning.
    seq_y : list or tuple
        Sequence of numbers, if not integers specify n_partitions for binning.
    LEN_past : int
        Parameter "L": Window length of immediate past values of seq_x and seq_y.
        Must be greater than 1.
    ADD_meas : int
        Parameter "w": Window length of present values of seq_x. Minimal data length
        over which CC rate can be reliably estimated, application/domain-specific.
        Must be greater than 1.
    STEP_size : int
        Parameter "delta": Step-size for sliding chunks across both sequences. An
        overlap of 20-50% between successive chunks or windows suggested.
        Must be greater than 1.
    n_partitions : int or bool, optional
        Parameter "B": Number of bins. Smallest number of symbols that capture the
        time series dynamics. The default is False indicating that the data is
        already in the form of discrete symbolic sequences.

    Returns
    -------
    CCC : float
        Estimated Compression-Complexity based Causality for direction seq_y -> seq_x.

    Raises
    ------
    AssertionError
        If the sequences differ in length, or if LEN_past, ADD_meas or STEP_size
        is not an int greater than 1.
    TypeError
        If, after partitioning/conversion, the sequences are neither numpy arrays
        nor a pair of array.array objects.
    ZeroDivisionError
        If the sequences are too short for even one window, i.e.
        len(seq_x) <= LEN_past + ADD_meas.

    """
    # Sanity checks (note: the conditions require values strictly greater than 1)
    assert len(seq_x) == len(seq_y), "ERROR: Sequences must have the same length!"
    assert (
        isinstance(LEN_past, int) and LEN_past > 1
    ), "ERROR: LEN_past must be a positive integer!"
    assert (
        isinstance(ADD_meas, int) and ADD_meas > 1
    ), "ERROR: ADD_meas must be a positive integer!"
    assert (
        isinstance(STEP_size, int) and STEP_size > 1
    ), "ERROR: STEP_size must be a positive integer!"

    # Partition data if requested with the specified number of bins
    if n_partitions:
        seq_x = partition(seq_x, n_partitions)
        seq_y = partition(seq_y, n_partitions)

    # Check whether input is a discrete symbolic sequence, convert otherwise
    if not arraytype(seq_x):
        seq_x = cast(seq_x)
    if not arraytype(seq_y):
        seq_y = cast(seq_y)

    # Choose how two chunks are concatenated. numpy takes priority: as soon as one
    # side is an ndarray, "+" would mean element-wise addition rather than
    # concatenation, so np.hstack is the only safe choice for mixed inputs.
    if isinstance(seq_x, np.ndarray) or isinstance(seq_y, np.ndarray):

        def combine(head, tail):
            return np.hstack([head, tail])

    elif isinstance(seq_x, array.array) and isinstance(seq_y, array.array):

        def combine(head, tail):
            return head + tail

    else:
        raise TypeError(
            "ERROR: Sequences must be numpy arrays or array.array objects, got "
            f"{type(seq_x).__name__} and {type(seq_y).__name__}!"
        )

    # Setup variables
    LEN = len(seq_x)
    LEN_to_check = LEN_past + ADD_meas

    # Initialize aggregators
    l_1D = []
    l_2D = []

    # Iterate over chunks of both sequences
    for k in range(0, LEN - LEN_to_check, STEP_size):
        past_end = k + LEN_past
        present_end = k + LEN_to_check

        ## Compression-Complexity of past values of seq_x
        # 1D ETC of a chunk of seq_x of length LEN_past
        ETC1D_ini = get1D(seq_x[k:past_end])["NETC1D"]

        ## Compression-Complexity of past values of seq_x and seq_y
        # 2D ETC of chunks of both seq_x, seq_y of length LEN_past at the same locus
        ETC2D_ini = get2D(seq_x[k:past_end], seq_y[k:past_end])["NETC2D"]

        ## Compression-Complexity of present values of seq_x
        # 1D ETC of a chunk of seq_x of length LEN_to_check
        ETC1D_fin = get1D(seq_x[k:present_end])["NETC1D"]

        ## Compression-Complexity of values of seq_x & past of seq_y + present of seq_x
        # 2D ETC of chunks of length LEN_to_check at the same locus
        ETC2D_fin = get2D(
            seq_x[k:present_end],
            combine(seq_y[k:past_end], seq_x[past_end:present_end]),
        )["NETC2D"]

        # Dynamic Compression-Complexity of seq_x
        l_1D.append(ETC1D_fin - ETC1D_ini)

        # Dynamic Compression-Complexity of seq_x conditional on seq_y
        l_2D.append(ETC2D_fin - ETC2D_ini)

    # Without a single window the average below is undefined. Keep the exception
    # type the bare division would produce, but say what went wrong.
    if not l_1D:
        raise ZeroDivisionError(
            "ERROR: Sequences are too short, no window of length "
            "LEN_past + ADD_meas fits!"
        )

    ## Compute Compression-Complexity Causality
    # Average of the difference: CC(X | X_past) - CC(X | Y_past + X_present)
    CCC = (sum(l_1D) - sum(l_2D)) / len(l_1D)

    return CCC