def test_get_frequency_map():
    """Check that `get_frequency_map` raises AssertionError on invalid arguments.

    The cases cover an empty text, a zero substring length and a negative
    substring length.
    """
    invalid_arguments = (
        ("", 1),            # empty text
        ("GTACGTACC", 0),   # zero substring length
        ("GTACGTACC", -2),  # negative substring length
    )
    for sequence, length in invalid_arguments:
        with pytest.raises(AssertionError):
            get_frequency_map(sequence, length)
Exemple #2
0
def find_pattern_clumps(text: str, substring_length: int, window_length: int,
                        minimum_frequency: int):
    """Collect the substrings that form clumps somewhere in `text`.

    Every window of `window_length` consecutive characters of `text` is
    passed to `get_frequency_map` (presumably a mapping from each substring
    of length `substring_length` to its number of occurrences -- confirm
    against its definition). Any key whose value reaches `minimum_frequency`
    in at least one window is kept.

    Args:
        text: the string to scan.
        substring_length: length of the substrings to count, forwarded to
            `get_frequency_map`.
        window_length: size of the sliding window moved along `text`.
        minimum_frequency: smallest per-window count for a substring to be
            reported.

    Returns:
        set: the qualifying substrings; empty when `window_length` is larger
        than `len(text)`, since no window fits.
    """
    clumps = set()
    last_start = len(text) - window_length
    for start in range(last_start + 1):
        counts = get_frequency_map(text=text[start:start + window_length],
                                   substring_length=substring_length)
        # A substring only needs to qualify in one window to be reported.
        clumps.update(kmer for kmer, count in counts.items()
                      if count >= minimum_frequency)
    return clumps
Exemple #3
0
def find_pattern_clumps(text: str, substring_length: int, window_length: int,
                        minimum_frequency: int) -> Set[str]:
    """Return the substrings of `text` that appear in clumps.

    A substring of length `substring_length` is reported when, in at least
    one window of `window_length` consecutive characters of `text`, its
    count in the map returned by `get_frequency_map` is `minimum_frequency`
    or more -- i.e. the k-mer shows up several times in short succession.

    Args:
        text: the string to scan.
        substring_length: length of the substrings (k-mers) to count.
        window_length: size of the window slid along `text` one position at
            a time.
        minimum_frequency: minimum count inside a single window.

    Returns:
        Set[str]: the substrings that reach the threshold in some window;
        empty when no window of `window_length` fits in `text`.
    """
    window_starts = range(len(text) - window_length + 1)
    # One comprehension over (window, k-mer) pairs; the set removes k-mers
    # that qualify in more than one window.
    return {
        kmer
        for start in window_starts
        for kmer, count in get_frequency_map(
            text=text[start:start + window_length],
            substring_length=substring_length).items()
        if count >= minimum_frequency
    }
Exemple #4
0
def find_most_freq_k_substring(text: str,
                               substring_length: int) -> Tuple[List[str], int]:
    """
    Find the most frequent substrings of length `substring_length` in `text`.

    The counts come from `get_frequency_map`; every key whose count equals
    the maximum is returned, in the iteration order of that map.

    >>> find_most_freq_k_substring("GTACGTACC", 1)
    (['C'], 3)
    >>> find_most_freq_k_substring("GTACGTACC", 2)
    (['GT', 'TA', 'AC'], 2)
    >>> find_most_freq_k_substring("GTACGTACC", 4)
    (['GTAC'], 2)
    >>> find_most_freq_k_substring("GTACGTACC", 6)
    (['GTACGT', 'TACGTA', 'ACGTAC', 'CGTACC'], 1)

    Returns:
        Tuple[List[str], int]: the substrings tied for the highest count,
        and that count.

    Raises:
        ValueError: from `max()` if the frequency map is empty.
    """
    counts = get_frequency_map(text=text, substring_length=substring_length)
    highest = max(counts.values())
    winners = []
    for kmer, count in counts.items():
        if count == highest:
            winners.append(kmer)
    return winners, highest