Python string_to_binary_array Examples, fftmatch.string_to_binary_array Python Examples

Example #1

0

Show file

File: cvmatch.py Project: samanthasiow/639-final-project

def cv_match_index_chunk(texts, pattern, chunk_size='m'):
    """
    Run cv_match over overlapping windows of several texts at once.

    The texts are stacked into one array (one row per text) and searched in
    windows that are 2*chunk_size columns wide and advance by chunk_size
    columns, so consecutive windows overlap by chunk_size columns.

    This is similar to fftmatch.fft_match_index_n_log_m, but it operates on
    multiple texts at the same time.

    NOTE(review): rows for texts shorter than the longest one are padded by
    texts_to_array; the original author intended that padding to be 0, so it
    must change if 0 is part of the alphabet.

    Arguments
    ---------
    texts : list of str
        the genomic strings to search
    pattern : str
        the pattern that may be contained in multiple locations inside the text
    chunk_size : type str or int
        if 'm', the step is len(pattern), i.e. the string is searched in
            windows of size 2m (the standard n log m scheme)
        if a positive integer, the string is searched in windows of size
            2*chunk_size

    Returns
    -------
    numpy array with one entry per text, each entry holding the sorted,
    de-duplicated values of ``start + cv_match(...)[i]`` as ints (presumably
    the 0-based match positions of pattern in that text -- this assumes
    cv_match returns window-relative indices per row).  When every text has
    the same number of hits this is a 2-D int array; otherwise it is a 1-D
    object array of int arrays.

    Raises
    ------
    ValueError
        if chunk_size is neither 'm' nor a positive integer
    """
    # bool is a subclass of int; keep rejecting it as the original
    # exact-type check did.
    is_int = isinstance(chunk_size, int) and not isinstance(chunk_size, bool)
    if not (chunk_size == 'm' or (is_int and chunk_size > 0)):
        raise ValueError("cv_match_index_chunk chunk_size must be 'm' or a "
                         "positive integer")

    n = max(map(len, texts))
    m = len(pattern)

    texts = texts_to_array(texts)
    # Wrapping the encoded pattern in a list adds a leading axis so that it
    # has the same number of dimensions as the stacked texts.
    pattern = np.array([string_to_binary_array(pattern)]).astype(np.float32)

    if chunk_size == 'm':
        chunk_size = m

    num_texts = texts.shape[0]
    indices = [np.array([]) for _ in range(num_texts)]

    start = 0
    # The second clause guarantees one window when the longest text is no
    # longer than chunk_size (e.g. a text exactly as long as the pattern);
    # without it such inputs were never searched at all.
    while start < n - chunk_size or (start == 0 and 0 < m <= n):
        index = cv_match(texts[:, start:start + chunk_size * 2], pattern)
        for i in range(num_texts):
            # .shape is a tuple, so the old ``shape > 0`` test raised
            # TypeError on Python 3; compare the element count instead.
            if np.size(index[i]) > 0:
                indices[i] = np.append(indices[i], start + index[i])

        start += chunk_size

    # Overlapping windows report the same position twice; np.unique removes
    # the duplicates and sorts.
    out = [np.unique(found).astype(int) for found in indices]

    if len(set(len(hits) for hits in out)) <= 1:
        return np.array(out)

    # Texts with different hit counts cannot form a rectangular array, and
    # recent NumPy refuses to build a ragged one implicitly, so fill an
    # object array explicitly.
    ragged = np.empty(len(out), dtype=object)
    for i, hits in enumerate(out):
        ragged[i] = hits
    return ragged

Example #2

0

Show file

File: cvmatch.py Project: imichaelnorris/639-final-project

def cv_match_index_chunk(texts, pattern, chunk_size='m'):
    """
    Run cv_match over overlapping windows of several texts at once.

    The texts are stacked into one array (one row per text) and searched in
    windows that are 2*chunk_size columns wide and advance by chunk_size
    columns, so consecutive windows overlap by chunk_size columns.

    This is similar to fftmatch.fft_match_index_n_log_m, but it operates on
    multiple texts at the same time.

    NOTE(review): rows for texts shorter than the longest one are padded by
    texts_to_array; the original author intended that padding to be 0, so it
    must change if 0 is part of the alphabet.

    Arguments
    ---------
    texts : list of str
        the genomic strings to search
    pattern : str
        the pattern that may be contained in multiple locations inside the text
    chunk_size : type str or int
        if 'm', the step is len(pattern), i.e. the string is searched in
            windows of size 2m (the standard n log m scheme)
        if a positive integer, the string is searched in windows of size
            2*chunk_size

    Returns
    -------
    numpy array with one entry per text, each entry holding the sorted,
    de-duplicated values of ``start + cv_match(...)[i]`` as ints (presumably
    the 0-based match positions of pattern in that text -- this assumes
    cv_match returns window-relative indices per row).  When every text has
    the same number of hits this is a 2-D int array; otherwise it is a 1-D
    object array of int arrays.

    Raises
    ------
    ValueError
        if chunk_size is neither 'm' nor a positive integer
    """
    # bool is a subclass of int; keep rejecting it as the original
    # exact-type check did.
    is_int = isinstance(chunk_size, int) and not isinstance(chunk_size, bool)
    if not (chunk_size == 'm' or (is_int and chunk_size > 0)):
        raise ValueError("cv_match_index_chunk chunk_size must be 'm' or a "
                         "positive integer")

    n = max(map(len, texts))
    m = len(pattern)

    texts = texts_to_array(texts)
    # Wrapping the encoded pattern in a list adds a leading axis so that it
    # has the same number of dimensions as the stacked texts.
    pattern = np.array([string_to_binary_array(pattern)]).astype(np.float32)

    if chunk_size == 'm':
        chunk_size = m

    num_texts = texts.shape[0]
    indices = [np.array([]) for _ in range(num_texts)]

    start = 0
    # The second clause guarantees one window when the longest text is no
    # longer than chunk_size (e.g. a text exactly as long as the pattern);
    # without it such inputs were never searched at all.
    while start < n - chunk_size or (start == 0 and 0 < m <= n):
        index = cv_match(texts[:, start:start + chunk_size * 2], pattern)
        for i in range(num_texts):
            # .shape is a tuple, so the old ``shape > 0`` test raised
            # TypeError on Python 3; compare the element count instead.
            if np.size(index[i]) > 0:
                indices[i] = np.append(indices[i], start + index[i])

        start += chunk_size

    # Overlapping windows report the same position twice; np.unique removes
    # the duplicates and sorts.
    out = [np.unique(found).astype(int) for found in indices]

    if len(set(len(hits) for hits in out)) <= 1:
        return np.array(out)

    # Texts with different hit counts cannot form a rectangular array, and
    # recent NumPy refuses to build a ragged one implicitly, so fill an
    # object array explicitly.
    ragged = np.empty(len(out), dtype=object)
    for i, hits in enumerate(out):
        ragged[i] = hits
    return ragged

Example #3

0

Show file

File: cvmatch.py Project: samanthasiow/639-final-project

def cv_match_index(texts, pattern):
    """
    Search every string in ``texts`` for ``pattern`` in a single cv_match call.

    The texts are stacked with texts_to_array, the pattern is encoded with
    string_to_binary_array and cast to float32, and both are handed to
    cv_match (which, per the original author's note, relies on Open CV's
    template matching).  Whatever cv_match returns is returned unchanged.
    """
    stacked_texts = texts_to_array(texts)

    # The extra list level gives the encoded pattern a leading axis before
    # the float32 cast.
    encoded_pattern = string_to_binary_array(pattern)
    template = np.array([encoded_pattern]).astype(np.float32)

    return cv_match(stacked_texts, template)

Example #4

0

Show file

File: cvmatch.py Project: imichaelnorris/639-final-project

def cv_match_index(texts, pattern):
    """
    Search every string in ``texts`` for ``pattern`` in a single cv_match call.

    The texts are stacked with texts_to_array, the pattern is encoded with
    string_to_binary_array and cast to float32, and both are handed to
    cv_match (which, per the original author's note, relies on Open CV's
    template matching).  Whatever cv_match returns is returned unchanged.
    """
    stacked_texts = texts_to_array(texts)

    # The extra list level gives the encoded pattern a leading axis before
    # the float32 cast.
    encoded_pattern = string_to_binary_array(pattern)
    template = np.array([encoded_pattern]).astype(np.float32)

    return cv_match(stacked_texts, template)

Example #5

0

Show file

File: cvmatch.py Project: samanthasiow/639-final-project

def texts_to_array(texts):
    """
    Stack several texts into one zero-padded float32 array, one row per text.

    Each text is encoded with string_to_binary_array (per the original
    author's note, its ascii representation) and written into the leading
    columns of its row.

    Arguments
    ---------
    texts : list of str
        texts has k rows, and the maximum length string is length N

    Returns
    -------
    arr : numpy array
        k X N float32 array; row i holds the encoding of texts[i] in its
        first len(texts[i]) columns and 0 in any remaining columns
    """
    n = max(map(len, texts))
    # np.zeros instead of np.ndarray(shape): the bare constructor leaves the
    # buffer uninitialised, so rows shorter than n used to end in arbitrary
    # values instead of a well-defined padding.
    out = np.zeros((len(texts), n), dtype=np.float32)
    for index, row in enumerate(texts):
        out[index, :len(row)] = string_to_binary_array(row)

    return out

Example #6

0

Show file

File: cvmatch.py Project: imichaelnorris/639-final-project

def texts_to_array(texts):
    """
    Stack several texts into one zero-padded float32 array, one row per text.

    Each text is encoded with string_to_binary_array (per the original
    author's note, its ascii representation) and written into the leading
    columns of its row.

    Arguments
    ---------
    texts : list of str
        texts has k rows, and the maximum length string is length N

    Returns
    -------
    arr : numpy array
        k X N float32 array; row i holds the encoding of texts[i] in its
        first len(texts[i]) columns and 0 in any remaining columns
    """
    n = max(map(len, texts))
    # np.zeros instead of np.ndarray(shape): the bare constructor leaves the
    # buffer uninitialised, so rows shorter than n used to end in arbitrary
    # values instead of a well-defined padding.
    out = np.zeros((len(texts), n), dtype=np.float32)
    for index, row in enumerate(texts):
        out[index, :len(row)] = string_to_binary_array(row)

    return out