コード例 #1
0
def compute(seq_x, seq_y, order=2, verbose=True, truncate=True):
    """
    Estimate the two-dimensional Effort-To-Compress (ETC) for a pair of sequences.

    Both inputs are passed through ``cast`` and then handed to one of four
    private estimators, selected by the ``verbose`` and ``truncate`` flags.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers. Must have the same length as ``seq_x``.
    order : int, optional
        Number of elements in window for substitution.
        The default is 2 for pairs.
    verbose : bool, optional
        Whether to also return the trajectory produced by the estimator.
        The default is True.
    truncate : bool, optional
        Whether to use the truncated estimator instead of the full one.
        The default is True.

    Returns
    -------
    dict
        "ETC2D" holds the estimate and "NETC2D" holds the estimate divided by
        the length of the converted ``seq_x`` minus one. When ``verbose`` is
        True, "Trajectory" holds the second value returned by the estimator.

    Raises
    ------
    AssertionError
        If the two input sequences differ in length.

    """
    assert len(seq_x) == len(seq_y), \
        "ERROR: The 2 sequences should have the same length!"

    # Convert both inputs before handing them to the estimators
    seq_x = cast(seq_x)
    seq_y = cast(seq_y)

    if verbose:
        # Verbose estimators return the estimate together with a trajectory
        if truncate:
            estimator = _compute_verbose_truncated
        else:
            estimator = _compute_verbose_full
        etc, trajectory = estimator(seq_x, seq_y, order)
        return {
            "ETC2D": etc,
            "NETC2D": etc / (len(seq_x) - 1),
            "Trajectory": trajectory,
        }

    # Compact estimators return only the estimate
    if truncate:
        estimator = _compute_compact_truncated
    else:
        estimator = _compute_compact_full
    etc = estimator(seq_x, seq_y, order)
    return {"ETC2D": etc, "NETC2D": etc / (len(seq_x) - 1)}
コード例 #2
0
def generate(size=10, partitions=2, seed=None):
    """
    Generate a random sequence of discrete symbols of the requested length.

    Symbols are drawn with ``choices`` from the integers ``1`` to
    ``partitions`` (inclusive) and the resulting list is passed through
    ``recode.cast``.

    Parameters
    ----------
    size : int, optional
        Length of sequence to generate. The default is 10.
    partitions : int, optional
        Number of bins/partitions to create, 2 or greater. The default is 2.
    seed : int, optional
        Seed value for initializing the random number generator. The default
        is None, which leaves the generator untouched. Any other value,
        including 0, is forwarded to the seeding function.

    Returns
    -------
    object or None
        Whatever ``recode.cast`` returns for the sampled list of integers, or
        None when ``size`` or ``partitions`` is not an int or ``partitions``
        is smaller than 2.

    """
    if not (isinstance(partitions, int) and isinstance(size, int) and partitions >= 2):
        print(partitions, size)
        print(">> Number of bins is invalid ...")
        return None

    # Compare against None rather than relying on truthiness, so that a seed
    # of 0 is honoured instead of being silently ignored
    if seed is not None:
        seedvalue(seed)

    return recode.cast(choices(range(1, partitions + 1), k=size))
コード例 #3
0
def compute_save(seq, filename, order=2, truncate=True):
    """
    Estimate the Effort-To-Compress of a sequence and save the trajectory.

    The sequence is converted with ``cast``, run through the verbose estimator
    selected by ``truncate``, and the trajectory returned by that estimator is
    handed to ``save`` together with ``filename``.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    filename : str or Path object
        Name of output file or path to output file.
    order : int, optional
        Number of elements in window for substitution.
        The default is 2 for pairs.
    truncate : bool, optional
        Whether to halt iterative estimation once fully saturated 'axiom' has
        been reached. The default is True.

    Returns
    -------
    dict
        "ETC1D" holds the estimate and "NETC1D" holds the estimate divided by
        the length of the converted sequence minus one.

    """
    # Convert the input before estimation
    seq = cast(seq)

    # Pick the verbose estimator matching the truncation flag
    estimator = _compute_verbose_truncated if truncate else _compute_verbose_full
    etc, trajectory = estimator(seq, order)

    # Persist the trajectory before reporting the summary values
    save(trajectory, filename)

    normalized = etc / (len(seq) - 1)
    return {"ETC1D": etc, "NETC1D": normalized}
コード例 #4
0
def compute(seq, order=2, verbose=False, truncate=True):
    """
    Estimate the Effort-To-Compress for a given sequence using the NSRPS algorithm.

    The sequence is converted with ``cast`` and dispatched to one of four
    private estimators depending on ``verbose`` and ``truncate``. The default
    options give the fastest results.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    order : int, optional
        Number of elements in window for substitution.
        The default is 2 for pairs.
    verbose : bool, optional
        Whether to also return the trajectory of the algorithm.
        The default is False.
    truncate : bool, optional
        Whether to halt iterative estimation once fully saturated 'axiom' has
        been reached. The default is True.

    Returns
    -------
    dict
        "ETC1D" holds the estimate and "NETC1D" holds the estimate divided by
        the length of the converted sequence minus one. When ``verbose`` is
        True, "Trajectory" holds the second value returned by the estimator.

    """
    # Convert the input before estimation
    seq = cast(seq)

    if verbose:
        # Verbose estimators return the estimate together with a trajectory
        if truncate:
            estimator = _compute_verbose_truncated
        else:
            estimator = _compute_verbose_full
        etc, trajectory = estimator(seq, order)
        return {
            "ETC1D": etc,
            "NETC1D": etc / (len(seq) - 1),
            "Trajectory": trajectory,
        }

    # Compact estimators return only the estimate
    if truncate:
        estimator = _compute_compact_truncated
    else:
        estimator = _compute_compact_full
    etc = estimator(seq, order)
    return {"ETC1D": etc, "NETC1D": etc / (len(seq) - 1)}
コード例 #5
0
def compute_save(seq_x, seq_y, filename, truncate=True, order=2):
    """
    Estimate the 2D Effort-To-Compress of two sequences and save the trajectory.

    Both sequences are converted with ``cast``, run through the verbose
    estimator selected by ``truncate``, and the trajectory returned by that
    estimator is handed to ``save`` together with ``filename``.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers. Must have the same length as ``seq_x``.
    filename : str or Path object
        Name of output file or path to output file.
    truncate : bool, optional
        Whether to use the truncated estimator instead of the full one.
        The default is True.
    order : int, optional
        Number of elements in window for substitution.
        The default is 2 for pairs.

    Returns
    -------
    dict
        "ETC2D" holds the estimate and "NETC2D" holds the estimate divided by
        the length of the converted ``seq_x`` minus one.

    Raises
    ------
    AssertionError
        If the two input sequences differ in length.

    """
    assert len(seq_x) == len(seq_y), \
        "ERROR: The 2 sequences should have the same length!"

    # Convert both inputs before estimation
    seq_x = cast(seq_x)
    seq_y = cast(seq_y)

    # Pick the verbose estimator matching the truncation flag
    estimator = _compute_verbose_truncated if truncate else _compute_verbose_full
    etc, trajectory = estimator(seq_x, seq_y, order)

    # Persist the trajectory before reporting the summary values
    save(trajectory, filename)

    normalized = etc / (len(seq_x) - 1)
    return {"ETC2D": etc, "NETC2D": normalized}
コード例 #6
0
def compute_complexity(seq):
    """
    Compute the Lempel-Ziv complexity of a sequence.

    Parameters
    ----------
    seq : object
        Sequence to analyse. Inputs for which ``arraytype`` is falsy are first
        converted with ``cast``.

    Returns
    -------
    object or None
        None if the conversion with ``cast`` yields None, 2 if
        ``core.check_equality`` reports that all elements are equal (a notice
        is printed in that case), otherwise the result of ``core.lzc_a``.

    """
    # Convert the input when it is not already of the expected array type
    if not arraytype(seq):
        converted = cast(seq)
        if converted is None:
            return None
        seq = converted

    # Only sequences with at least two distinct symbols reach the LZ routine
    if not core.check_equality(seq):
        return core.lzc_a(seq)

    # Constant sequences are short-circuited (LZ76 of such inputs is 2)
    print("> All elements in sequence are equal!")
    return 2
コード例 #7
0
ファイル: test_recode.py プロジェクト: rahulvenugopal/ETCPy
def test_cast_valid(x):
    """Check that recode.cast returns an array with typecode "I" for input x."""
    # NOTE(review): x is presumably supplied by a fixture or parametrization
    # that is not visible here - confirm against the test module.
    result = recode.cast(x)

    assert isinstance(result, array)
    assert result.typecode == "I"
コード例 #8
0
ファイル: test_recode.py プロジェクト: rahulvenugopal/ETCPy
def test_cast_zeroes():
    """Check that recode.cast returns None for a sequence made only of zeroes."""
    zeroes = [0, 0, 0, 0]

    assert recode.cast(zeroes) is None
コード例 #9
0
ファイル: test_recode.py プロジェクト: rahulvenugopal/ETCPy
def test_cast_invalid(x):
    """Check that recode.cast returns None for the invalid input x."""
    # NOTE(review): x is presumably supplied by a fixture or parametrization
    # that is not visible here - confirm against the test module.
    result = recode.cast(x)

    assert result is None
コード例 #10
0
def onestep(seq, order, verbose=True, check=True):
    """
    Run a single NSRWS step on a sequence after validating the inputs.

    This is the guarded entry point around ``_onestep``: the input is converted
    to the expected array type if needed, optionally tested for being constant,
    and compared against the window size before the step is executed.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
        Inputs for which ``arraytype`` is falsy are converted with ``cast``.
    order : int
        Size of window for NSRWS, 2 or greater.
    verbose : bool, optional
        Whether to report extra details. These include the frequent pair that
        was substituted, its counts & total time taken. The default is True.
    check : bool, optional
        Whether to test the sequence for equality of all symbols before
        running the step. The default is True.

    Returns
    -------
    tuple or None
        None when the input cannot be converted, when ``check`` is True and all
        symbols are equal, or when the sequence is shorter than ``order`` (a
        notice is printed in the latter two cases). Otherwise the value
        returned by ``_onestep``, documented as a tuple with these fixed
        elements, in this order:

        array.array
            Discrete symbolic sequence containing 32-bit unsigned integers,
            with the most frequently occurring non-sequentially overlapping
            window substituted.
        bool
            Indicator for the state of sequence with all distinct pairs
            (count=1).

        followed, when ``verbose`` is True, by:

        array.array
            Frequent window substituted.
        int
            Number of times the frequent window occurred in the sequence.
        float
            Time taken to execute the step.

    """
    # Convert the input when it is not already of the expected array type
    if not arraytype(seq):
        seq = cast(seq)
        if seq is None:
            return None

    # The equality test is optional and only evaluated when requested
    if check:
        if core.check_equality(seq):
            print("> All elements in sequence are equal!")
            return None

    # A window cannot be formed when the sequence is shorter than the order
    too_short = len(seq) < order
    if too_short:
        print("> Sequence input shorter than order!\n> Can't perform substitution ...")
        return None

    # All checks passed: delegate to the actual implementation
    return _onestep(seq, order, verbose)
コード例 #11
0
def _onestep_windows(seq, order, verbose=True):
    """
    Execute one full step of NSRWS with order>=2 for a given sequence.

    The step is carried out in three stages:
        1. Compute a mask over the sliding windows -> core.get_mask_windows()
        2. Apply the mask and find the most frequent window -> _mask_and_count()
        3. Substitute that window with a new symbol -> core.substitute_windows()

    The new symbol is one more than the largest value in the sequence. If the
    most frequent window occurs only once, only the first window of the
    sequence is replaced and the returned signal is True.

    Notes from the original author, kept for reference:
        - Compared to _onestep_pairs this is slower (more nested loops and
          checks) but handles the generalized case of different window orders.
        - For higher window orders, correctness needs to be proved outside of
          tests.
        - Possible improvements: decorators for timing and verbosity, and a
          Cython implementation of the slowest part, _mask_and_count.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    order : int
        Size of window for NSRWS, 2 or greater.
    verbose : bool, optional
        Whether to report extra details. These include the frequent window that
        was substituted, its counts & total time taken. The default is True.

    Returns
    -------
    tuple, of the following fixed elements:
        out : array.array
            Sequence after substitution, as returned by ``cast``.

        signal : bool
            True when the most frequent window occurred exactly once.

    optional elements of tuple that depend on verbosity:
        freq_window : array.array
            Frequent window substituted

        count : int
            Number of times the frequent window occurred in the sequence

        time_taken : float
            Time taken to execute step, measured with perf_counter

    """
    # Start the timer
    started = perf_counter()

    # Mask the sliding windows, then find the most frequent remaining window
    mask = core.get_mask_windows(seq, order)
    freq_window, count = _mask_and_count(seq, mask, order)

    # The replacement symbol is one larger than any symbol currently in use
    sub_value = 1 + max(seq)

    # The signal marks a sequence whose most frequent window occurs only once
    signal = count == 1
    if signal:
        # Drop the first (order - 1) symbols and overwrite the new head, which
        # replaces the first window by the single new symbol
        out = cast(seq[order - 1:])
        out[0] = sub_value
    else:
        # Replace every instance of the frequent window
        out = cast(core.substitute_windows(seq, order, freq_window, sub_value))

    # Stop the timer
    elapsed = perf_counter() - started

    # Verbose callers additionally get the window, its count and the timing
    if verbose:
        return out, signal, freq_window, count, elapsed
    return out, signal
コード例 #12
0
def _mask_and_count(seq, mask, order):
    """
    Apply a binary mask to the sliding windows of a sequence and return the
    most frequently occurring window together with its count.

    The work is done lazily in three stages:
        1. Build sliding windows of size "order" - using zip and islice
        2. Keep only the windows selected by the mask - using compress
        3. Count the remaining windows - using Counter

    In the NSRWS algorithm, this is the most time consuming step. Conceptually
    the 1D sequence is expanded into a 2D table with one sliding window per
    row:
        1D sequence:
            (1,2,3,4,5,6,7)

        2D expansion for window order=3:
            ((1,2,3),
             (2,3,4),
             (3,4,5),
             (4,5,6),
             (5,6,7))

    The mask is applied row-wise and is expected to have one entry per row:
            len(mask) = len(seq) - (order - 1)

        Example application of the mask (1,0,0,1,1):
            1 -> ((1,2,3),
            0 ->  (2,3,4),    ---->      ((1,2,3),
            0 ->  (3,4,5),                (4,5,6),
            1 ->  (4,5,6),                (5,6,7))
            1 ->  (5,6,7))

    The surviving rows are counted and the most frequent one is returned.

    Example with overlapping windows:
        1D sequence:
            (1,1,1,1,1,2,1)

        2D expansion for window order=3:
            ((1,1,1),
             (1,1,1),    ----> overlap
             (1,1,1),    ----> overlap
             (1,1,2),
             (1,2,1))

        With the mask (1,0,0,1,1) the surviving rows are:
            ((1,1,1),
             (1,1,2),
             (1,2,1))

        Each window occurs once, so the first one is returned -> (1,1,1)

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    mask : array.array
        Collection of Booleans, where 0s indicate windows of "seq" to mask out.
        0s correspond to overlapping windows.
    order : int
        Size of window for NSRWS, 2 or greater.

    Returns
    -------
    freq_window : array.array
        Most frequently occurring unmasked window of size "order", converted
        with ``cast``.
    count : int
        Number of times the most frequently occurring window occurs.

    """
    # One shifted view of the sequence per position inside the window
    shifted = [islice(seq, offset, None) for offset in range(order)]

    # Zipping the shifted views yields the sliding windows as hashable tuples
    windows = zip(*shifted)

    # Discard the windows that the mask marks with a falsy entry
    unmasked = compress(windows, mask)

    # Tally the remaining windows and take the single most common entry
    tally = Counter(unmasked)
    top_window, count = tally.most_common(1)[0]

    # Convert the winning tuple to the array type before returning
    return cast(top_window), count
コード例 #13
0
ファイル: compute_CCC.py プロジェクト: rahulvenugopal/ETCPy
def compute(seq_x, seq_y, LEN_past, ADD_meas, STEP_size, n_partitions=False):
    """
    Estimate the Compression-Complexity based Causality for two sequences.

    The direction of causality being assessed is from seq_y -> seq_x. Various other
    parameters need to be specified, a brief description is offered below.

    For detailed explanations regarding the parameters, interpretations and of the inner
    workings, please refer to the research article along with the supplementary:
        Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for
        Complexity-Causality Measure.” PeerJ Computer Science 5 (May 2019): e196.
        https://doi.org/10.7717/peerj-cs.196.

    Parameters
    ----------
    seq_x : list or tuple
        Sequence of numbers, if not integers specify n_partitions for binning.
    seq_y : list or tuple
        Sequence of numbers, if not integers specify n_partitions for binning.
    LEN_past : int
        Parameter "L": Window length of immediate past values of seq_x and seq_y.
        Must be greater than 1.
    ADD_meas : int
        Parameter "w": Window length of present values of seq_x. Minimal data length
        over which CC rate can be reliably estimated, application/domain-specific.
        Must be greater than 1.
    STEP_size : int
        Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
        of 20-50% between successive chunks or windows suggested. Must be greater than 1.
    n_partitions : int or bool, optional
        Parameter "B": Number of bins. Smallest number of symbols that capture the time
        series dynamics. The default is False indicating that the data is already in the
        form of discrete symbolic sequences.

    Returns
    -------
    CCC : float
        Estimated Compression-Complexity based Causality for direction seq_y -> seq_x.

    Raises
    ------
    AssertionError
        If the sequences differ in length, or if LEN_past, ADD_meas or STEP_size is
        not an int greater than 1.
    TypeError
        If, after conversion, neither sequence is a numpy.ndarray and the two are not
        both array.array, so no way of concatenating chunks is available.
    ZeroDivisionError
        If len(seq_x) <= LEN_past + ADD_meas, in which case no chunk is evaluated.

    """
    # Sanity checks
    # NOTE(review): the messages say "positive integer" while the conditions require
    # values greater than 1 - confirm which of the two is intended.
    assert len(seq_x) == len(seq_y), "ERROR: Sequences must have the same length!"
    assert (
        isinstance(LEN_past, int) and LEN_past > 1
    ), "ERROR: LEN_past must be a positive integer!"
    assert (
        isinstance(ADD_meas, int) and ADD_meas > 1
    ), "ERROR: ADD_meas must be a positive integer!"
    assert (
        isinstance(STEP_size, int) and STEP_size > 1
    ), "ERROR: STEP_size must be a positive integer!"

    # Partition data if requested with the specified number of bins
    if n_partitions:
        seq_x = partition(seq_x, n_partitions)
        seq_y = partition(seq_y, n_partitions)

    # Check whether input is a discrete symbolic sequence
    if not arraytype(seq_x):
        seq_x = cast(seq_x)
    if not arraytype(seq_y):
        seq_y = cast(seq_y)

    # Choose how two chunks are concatenated. numpy takes precedence: "+" on a
    # numpy array adds elementwise instead of concatenating, so as soon as one
    # of the sequences is a numpy array the chunks must be joined with hstack.
    if isinstance(seq_x, np.ndarray) or isinstance(seq_y, np.ndarray):

        def combine(x, y):
            return np.hstack([x, y])

    elif isinstance(seq_x, array.array) and isinstance(seq_y, array.array):

        def combine(x, y):
            return x + y

    else:
        raise TypeError(
            "ERROR: Sequences must be numpy.ndarray or array.array after conversion!"
        )

    # Setup variables
    LEN = len(seq_x)
    LEN_to_check = LEN_past + ADD_meas

    # Initialize aggregators
    l_1D = []
    l_2D = []

    # Iterate over chunks of both sequences
    for k in range(0, LEN - LEN_to_check, STEP_size):

        # Chunks shared by several of the estimates below
        x_past = seq_x[k : k + LEN_past]
        x_full = seq_x[k : k + LEN_to_check]

        ## Compression-Complexity of past values of seq_x
        # 1D ETC of a chunk of seq_x of length LEN_past
        ETC1D_ini = get1D(x_past)["NETC1D"]

        ## Compression-Complexity of past values of seq_x and seq_y
        # 2D ETC of chunks of both seq_x,seq_y of length LEN_past at the same locus
        ETC2D_ini = get2D(x_past, seq_y[k : k + LEN_past])["NETC2D"]

        ## Compression-Complexity of present values of seq_x
        # 1D ETC of a chunk of seq_x of length LEN_to_check
        ETC1D_fin = get1D(x_full)["NETC1D"]

        ## Compression-Complexity of values of seq_x & past of seq_y + present of seq_x
        # 2D ETC of chunks of both seq_x, seq_y of length LEN_to_check at the same locus
        ETC2D_fin = get2D(
            x_full,
            combine(seq_y[k : k + LEN_past], seq_x[k + LEN_past : k + LEN_to_check]),
        )["NETC2D"]

        # Dynamic Compression-Complexity of seq_x
        ETC1D_delta = ETC1D_fin - ETC1D_ini

        # Dynamic Compression Complexity of seq_x conditional on seq_y
        ETC2D_delta = ETC2D_fin - ETC2D_ini

        # Aggregate Dynamic CCs
        l_1D.append(ETC1D_delta)
        l_2D.append(ETC2D_delta)

    ## Compute Compression-Complexity Causality
    # Average of the difference: CC(X | X_past) - CC(X | Y_past + X_present)
    CCC = (sum(l_1D) - sum(l_2D)) / len(l_1D)
    return CCC