def cv_match_index_chunk(texts, pattern, chunk_size='m'):
    """
    Find the occurrences of ``pattern`` in every text by running cv_match
    over overlapping windows of the encoded texts.

    The texts are packed into one 2-D array with texts_to_array and the
    column window ``[start, start + 2 * chunk_size)`` is handed to cv_match
    for ``start = 0, chunk_size, 2 * chunk_size, ...`` while
    ``start < n - chunk_size`` (``n`` is the length of the longest text).
    Indices reported for a window are shifted by ``start``, and duplicates
    produced by the overlapping windows are removed.

    This is similar to fftmatch.fft_match_index_n_log_m, but it operates on
    multiple texts at the same time.

    NOTE(review): the original documentation says that when the sampled
    portion of the text is shorter than the pattern it is padded at the end
    with 0s ("change this if 0s are in the alphabet"). No padding happens in
    this function itself; presumably it refers to texts_to_array / cv_match
    -- confirm.

    Arguments
    ---------
    texts : list of str
        the genomic strings to search
    pattern : str
        the pattern that may be contained in multiple locations inside the
        texts
    chunk_size : str or int
        if 'm', the chunk size is ``len(pattern)``, i.e. the texts are
        scanned in windows of width 2m;
        if a positive integer, the texts are scanned in windows of width
        ``2 * chunk_size``

    Returns
    -------
    numpy array
        one sorted, de-duplicated integer array of 0-based match indices per
        text. When every text has the same number of matches this is a 2-D
        integer array; otherwise it is a 1-D object array whose elements are
        the per-text index arrays.

    Raises
    ------
    ValueError
        if ``chunk_size`` is neither 'm' nor a positive integer
    """
    # bool is a subclass of int; keep rejecting it as the exact-type check did.
    is_positive_int = (isinstance(chunk_size, int)
                       and not isinstance(chunk_size, bool)
                       and chunk_size > 0)
    if not (chunk_size == 'm' or is_positive_int):
        raise ValueError("cv_match_index_chunk chunk_size must be 'm' or a "
                         "positive integer")

    n = max(map(len, texts))
    m = len(pattern)
    text_array = texts_to_array(texts)
    template = np.array([string_to_binary_array(pattern)]).astype(np.float32)
    if chunk_size == 'm':
        chunk_size = m

    num_texts = text_array.shape[0]
    indices = [np.array([]) for _ in range(num_texts)]
    start = 0
    # NOTE(review): when n <= chunk_size this loop never runs, so texts that
    # fit inside a single chunk report no matches -- confirm this is intended.
    while start < n - chunk_size:
        window = text_array[:, start:start + chunk_size * 2]
        index = cv_match(window, template)
        for i in range(num_texts):
            hits = index[i]
            # Test the element count: ``.shape`` is a tuple and cannot be
            # ordered against an int on Python 3.
            if np.size(hits) > 0:
                indices[i] = np.append(indices[i], start + hits)
        start += chunk_size

    # Overlapping windows report the same match twice; unique also sorts.
    out = [np.unique(found).astype(int) for found in indices]
    if len({len(row) for row in out}) <= 1:
        return np.array(out)
    # Rows of different lengths cannot form a rectangular array, and current
    # NumPy refuses to build one implicitly, so fill an object array by hand.
    ragged = np.empty(len(out), dtype=object)
    for i, row in enumerate(out):
        ragged[i] = row
    return ragged
def cv_match_index_chunk(texts, pattern, chunk_size='m'):
    """
    Find the occurrences of ``pattern`` in every text by running cv_match
    over overlapping windows of the encoded texts.

    The texts are packed into one 2-D array with texts_to_array and the
    column window ``[start, start + 2 * chunk_size)`` is handed to cv_match
    for ``start = 0, chunk_size, 2 * chunk_size, ...`` while
    ``start < n - chunk_size`` (``n`` is the length of the longest text).
    Indices reported for a window are shifted by ``start``, and duplicates
    produced by the overlapping windows are removed.

    This is similar to fftmatch.fft_match_index_n_log_m, but it operates on
    multiple texts at the same time.

    NOTE(review): the original documentation says that when the sampled
    portion of the text is shorter than the pattern it is padded at the end
    with 0s ("change this if 0s are in the alphabet"). No padding happens in
    this function itself; presumably it refers to texts_to_array / cv_match
    -- confirm.

    Arguments
    ---------
    texts : list of str
        the genomic strings to search
    pattern : str
        the pattern that may be contained in multiple locations inside the
        texts
    chunk_size : str or int
        if 'm', the chunk size is ``len(pattern)``, i.e. the texts are
        scanned in windows of width 2m;
        if a positive integer, the texts are scanned in windows of width
        ``2 * chunk_size``

    Returns
    -------
    numpy array
        one sorted, de-duplicated integer array of 0-based match indices per
        text. When every text has the same number of matches this is a 2-D
        integer array; otherwise it is a 1-D object array whose elements are
        the per-text index arrays.

    Raises
    ------
    ValueError
        if ``chunk_size`` is neither 'm' nor a positive integer
    """
    # bool is a subclass of int; keep rejecting it as the exact-type check did.
    is_positive_int = (isinstance(chunk_size, int)
                       and not isinstance(chunk_size, bool)
                       and chunk_size > 0)
    if not (chunk_size == 'm' or is_positive_int):
        raise ValueError("cv_match_index_chunk chunk_size must be 'm' or a "
                         "positive integer")

    n = max(map(len, texts))
    m = len(pattern)
    text_array = texts_to_array(texts)
    template = np.array([string_to_binary_array(pattern)]).astype(np.float32)
    if chunk_size == 'm':
        chunk_size = m

    num_texts = text_array.shape[0]
    indices = [np.array([]) for _ in range(num_texts)]
    start = 0
    # NOTE(review): when n <= chunk_size this loop never runs, so texts that
    # fit inside a single chunk report no matches -- confirm this is intended.
    while start < n - chunk_size:
        window = text_array[:, start:start + chunk_size * 2]
        index = cv_match(window, template)
        for i in range(num_texts):
            hits = index[i]
            # Test the element count: ``.shape`` is a tuple and cannot be
            # ordered against an int on Python 3.
            if np.size(hits) > 0:
                indices[i] = np.append(indices[i], start + hits)
        start += chunk_size

    # Overlapping windows report the same match twice; unique also sorts.
    out = [np.unique(found).astype(int) for found in indices]
    if len({len(row) for row in out}) <= 1:
        return np.array(out)
    # Rows of different lengths cannot form a rectangular array, and current
    # NumPy refuses to build one implicitly, so fill an object array by hand.
    ragged = np.empty(len(out), dtype=object)
    for i, row in enumerate(out):
        ragged[i] = row
    return ragged
def cv_match_index(texts, pattern):
    """
    Search every text in ``texts`` for ``pattern`` in a single cv_match call.

    The texts are encoded into one 2-D array with texts_to_array, the
    pattern is encoded with string_to_binary_array and wrapped into a
    one-row float32 array, and both are passed to cv_match, whose result is
    returned unchanged. Per the original documentation, cv_match performs
    the substring search with Open CV's template matching algorithm.

    Arguments
    ---------
    texts : list of str
        the genome strings to search
    pattern : str
        the pattern to look for inside each text
    """
    text_matrix = texts_to_array(texts)
    encoded_pattern = string_to_binary_array(pattern)
    # cv_match receives the pattern as a single-row 2-D float32 template.
    template = np.array([encoded_pattern]).astype(np.float32)
    return cv_match(text_matrix, template)
def texts_to_array(texts):
    """
    Pack a list of strings into one zero-padded 2-D float32 array.

    Each text is encoded with string_to_binary_array (defined elsewhere;
    per the original documentation it yields the ascii representation of
    the string) and written into the start of its own row. Rows belonging
    to texts shorter than the longest one are padded at the end with 0s.

    Arguments
    ---------
    texts : list of str
        texts has k rows, and the maximum length string is length N

    Returns
    -------
    arr : numpy array
        k X N float32 array with the encoded representation of all of the
        texts, zero-padded on the right
    """
    n = max(map(len, texts))
    # np.zeros, not np.ndarray: the latter leaves the padding after shorter
    # rows as uninitialised memory.
    out = np.zeros((len(texts), n), dtype=np.float32)
    for index, row in enumerate(texts):
        out[index, :len(row)] = string_to_binary_array(row)
    return out
def texts_to_array(texts):
    """
    Pack a list of strings into one zero-padded 2-D float32 array.

    Each text is encoded with string_to_binary_array (defined elsewhere;
    per the original documentation it yields the ascii representation of
    the string) and written into the start of its own row. Rows belonging
    to texts shorter than the longest one are padded at the end with 0s.

    Arguments
    ---------
    texts : list of str
        texts has k rows, and the maximum length string is length N

    Returns
    -------
    arr : numpy array
        k X N float32 array with the encoded representation of all of the
        texts, zero-padded on the right
    """
    n = max(map(len, texts))
    # np.zeros, not np.ndarray: the latter leaves the padding after shorter
    # rows as uninitialised memory.
    out = np.zeros((len(texts), n), dtype=np.float32)
    for index, row in enumerate(texts):
        out[index, :len(row)] = string_to_binary_array(row)
    return out