Beispiel #1
0
def entropy(seq, legacy=False):
    """
    Compute the Shannon entropy (base 2) of a sequence.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    legacy : bool, optional
        When False (the default) and ``seq`` is an ``array`` whose typecode
        is "I", the computation is delegated to ``estimates.entropy``.
        When True, the pure-Python computation below is always used.

    Returns
    -------
    float
        Shannon entropy of sequence. An empty sequence yields the integer 0,
        because the sum below then has no terms.

    """
    # Delegate to the alternative implementation unless the caller opted out
    if not legacy and isinstance(seq, array) and seq.typecode == "I":
        return estimates.entropy(seq)

    def contribution(occurrences):
        # Relative frequency of one distinct value and its entropy term
        probability = occurrences / len(seq)
        return -probability * log2(probability)

    # One term per distinct value, summed in Counter's iteration order
    return sum(map(contribution, Counter(seq).values()))
Beispiel #2
0
def _compute_verbose_truncated(seq_x, seq_y, order=2):
    """
    Run a truncated, verbose NSRWS estimation of ETC on a pair of sequences.

    The NSRWS step (``_onestep``) is applied repeatedly to both sequences and
    a dictionary of metrics is recorded after every executed step:
        - step number
        - length of seq_x
        - entropy of seq_x and of seq_y
        - window reported for seq_x and for seq_y
        - count reported for that window
        - time reported by ``_onestep``

    The run has two phases:

    1. Steps are executed until ``_onestep`` returns a truthy signal, seq_x
       becomes shorter than ``order``, or ``cc.check_equality(seq_x, seq_y)``
       holds.
    2. If the signal was raised and the equality check still fails, at most
       five further steps are executed and recorded. The remaining steps are
       then not executed; instead ``len(seq_x) // (order - 1)`` is added to
       ETC, reduced by one when the division is exact.
       NOTE(review): presumably this assumes every remaining step shortens
       the sequence by ``order - 1`` elements -- confirm against _onestep.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers.
        NOTE(review): only len(seq_x) is ever inspected, so both sequences
        are presumably expected to have equal length -- confirm with callers.
    order : int, optional
        Number of elements in window for substitution.
        The default is 2 for pairs. ``order=1`` raises ZeroDivisionError if
        the extrapolation in phase 2 is reached.

    Returns
    -------
    etc : int
        Effort-To-Compress estimate: number of executed steps plus the
        extrapolated remainder from phase 2, if any.
    output : list
        List of dictionaries, one for the original sequences (step 0) and one
        for each executed step. Extrapolated steps have no entry.

    """
    # Cap on the number of steps still executed once the signal is raised
    tail_steps = 5

    # Initialize ETC to 0 and record the estimates for the original sequences
    etc = 0
    output = [_verbose_record(etc, seq_x, seq_y)]

    if cc.check_equality(seq_x, seq_y):
        return etc, output

    # Phase 1: iterate until the signal is raised, seq_x is shorter than the
    # window being substituted (order), or the equality check passes
    signal = False
    while (not signal and len(seq_x) >= order
           and not cc.check_equality(seq_x, seq_y)):

        # Run one step of NSRWS in verbose mode (returns window and count)
        seq_x, seq_y, signal, pair_x, pair_y, count, step_time = _onestep(
            seq_x, seq_y, order, verbose=True)

        etc += 1
        output.append(
            _verbose_record(etc, seq_x, seq_y, pair_x, pair_y, count,
                            step_time))

    # Phase 2: only entered when phase 1 stopped because of the signal
    if signal and not cc.check_equality(seq_x, seq_y):

        executed = 0
        while len(seq_x) >= order and executed < tail_steps:
            # The signal returned here is deliberately ignored
            seq_x, seq_y, _, pair_x, pair_y, count, step_time = _onestep(
                seq_x, seq_y, order, verbose=True)

            etc += 1
            output.append(
                _verbose_record(etc, seq_x, seq_y, pair_x, pair_y, count,
                                step_time))
            executed += 1

        # Extrapolate the steps that are not executed from the current length
        whole, leftover = divmod(len(seq_x), order - 1)
        etc += whole if leftover else whole - 1

    return etc, output


def _verbose_record(step, seq_x, seq_y, window_x=None, window_y=None,
                    count=None, time=None):
    """Build the metrics dictionary stored for one step of the verbose run."""
    return {
        "step": step,
        "length": len(seq_x),
        "entropy_x": ce.entropy(seq_x),
        "entropy_y": ce.entropy(seq_y),
        "window_x": window_x,
        "window_y": window_y,
        "count": count,
        "time": time,
    }