def vectorize(questions, answers, chars=None): """Vectorize the questions and expected answers""" print('Vectorization...') chars = chars or CHARS x_maxlen = max(len(question) for question in questions) y_maxlen = max(len(answer) for answer in answers) # print (len(questions), x_maxlen, len(chars)) len_of_questions = len(questions) ctable = CharacterTable(chars) print("X = np_zeros") X = np_zeros((len_of_questions, x_maxlen, len(chars)), dtype=bool) print("for i, sentence in enumerate(questions):") for i in range(len(questions)): sentence = questions.pop() for j, c in enumerate(sentence): X[i, j, ctable.char_indices[c]] = 1 print("y = np_zeros") y = np_zeros((len_of_questions, y_maxlen, len(chars)), dtype=bool) print("for i, sentence in enumerate(answers):") for i in range(len(answers)): sentence = answers.pop() for j, c in enumerate(sentence): y[i, j, ctable.char_indices[c]] = 1 # Explicitly set apart 10% for validation data that we never train over split_at = len(X) - len(X) // 10 (X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at)) (y_train, y_val) = (y[:split_at], y[split_at:]) print(X_train.shape) print(y_train.shape) return X_train, X_val, y_train, y_val, y_maxlen, ctable
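# A minimal, self-contained sketch of the one-hot scheme used by vectorize()
# above. The character set, the sample question, and the plain dict standing
# in for CharacterTable are illustrative assumptions, not the original's API.
import numpy as np

chars = sorted(set('0123456789+ '))
char_indices = {c: i for i, c in enumerate(chars)}
question = '12+345 '
X = np.zeros((len(question), len(chars)), dtype=bool)
for j, c in enumerate(question):
    X[j, char_indices[c]] = 1
assert (X.sum(axis=-1) == 1).all()  # one hot bit per character position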
def get_fit(self, fit_type, data, order, smooth, degree, begin, end, weight=None): z1 = np_zeros(begin) z2 = np_zeros(len(data[0]) - end) if end == 0: end = None x = data[0][begin:end] y = data[1][begin:end] if weight is not None: weight = weight[begin:end] if fit_type == "spline": f = inter.UnivariateSpline(x, y, w=weight, k=order, s=smooth) else: f = poly.Chebyshev.fit(x, y, degree, w=weight) res = y - f(x) nfit = f(x) / np_max(f(x)) corr = concatenate((z1, nfit, z2)) fitc = [x, f(x), corr, res] return fitc
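# A hedged, standalone sketch of the two fit branches in get_fit() above:
# scipy's UnivariateSpline versus a numpy Chebyshev polynomial fit on the
# same noisy data (the data, orders, and smoothing value are illustrative).
import numpy as np
from scipy import interpolate as inter
from numpy.polynomial import chebyshev as poly

x = np.linspace(0.0, 1.0, 50)
y = np.sin(2 * np.pi * x) + 0.01 * np.random.randn(50)
f_spline = inter.UnivariateSpline(x, y, k=3, s=0.5)   # fit_type == "spline" branch
f_cheb = poly.Chebyshev.fit(x, y, 7)                  # polynomial branch
res = y - f_spline(x)                                 # residuals, as stored in fitc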
def pca(self, data_matrix): """Perform PCA and return the principal components and their variance. Parameters ---------- data_matrix : list of lists List of tetranucleotide signatures Returns ------- The first three principal components and the variance explained by each. """ cols = len(data_matrix[0]) data_matrix = np_reshape(np_array(data_matrix), (len(data_matrix), cols)) pca = PCA() pc, variance = pca.pca_matrix(data_matrix, 3, bCenter=True, bScale=False) # ensure pc matrix has at least 3 dimensions if pc.shape[1] == 1: pc = np_append(pc, np_zeros((pc.shape[0], 2)), 1) variance = np_append(variance[0], np_ones(2)) elif pc.shape[1] == 2: pc = np_append(pc, np_zeros((pc.shape[0], 1)), 1) variance = np_append(variance[0:2], np_ones(1)) return pc, variance
def non_iter_ls_inv_stft(stft_object): stft_data = stft_object['stft'] origSigSize = stft_object['origSigSize'] num_rows, _, _ = origSigSize shift_length = stft_object['shift_length'] len_each_section, num_rows_overlap, _, _ = stft_data.shape # TODO: Isn't this just num_rows in the very beginning? # total_new_elements = (num_rows_overlap - 1) * shift_length + len_each_section win_info = stft_object['win_info'] wVec = win_info(len_each_section) wVecSq = wVec**2 vecC = np_arange(1, num_rows_overlap * shift_length, step=shift_length) # vecC = range(0, num_rows_overlap*shift_length-1, shift_length) DlsArr = np_zeros((num_rows, )) for j in vecC: tmpArr = np_arange(j - 1, len_each_section + j - 1) # tmpArr = np_arange(j, len_each_section+j) DlsArr[tmpArr] += wVecSq # DlsArrInv = 1/DlsArr invFT = math_sqrt(len_each_section) * np_ifft(stft_data, axis=0) invFT_real = invFT.real invFT *= wVec[:, np_newaxis, np_newaxis, np_newaxis] yEst = np_zeros(origSigSize) for index, j in enumerate(vecC): tmpArr = np_arange(j - 1, len_each_section + j - 1) yEst[tmpArr, :] += invFT_real[:, index, :] # sigOut = yEst * DlsArrInv[:, np_newaxis, np_newaxis] sigOut = yEst / DlsArr[:, np_newaxis, np_newaxis] return sigOut
def wlcs(self, src, tar): """Return the Rouge-W weighted longest common sub-sequence length. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- int (may return a float if cost has float values) The Rouge-W weighted longest common subsequence length between src & tar Examples -------- >>> cmp = RougeW() >>> cmp.wlcs('cat', 'hat') 4 >>> cmp.wlcs('Niall', 'Neil') 3 >>> cmp.wlcs('aluminum', 'Catalan') 5 >>> cmp.wlcs('ATCG', 'TAGC') 3 .. versionadded:: 0.4.0 """ src_len = len(src) tar_len = len(tar) if src == tar: return self._f_func(len(src)) if not src: return 0 if not tar: return 0 c_mat = np_zeros((src_len, tar_len), dtype=np_int) w_mat = np_zeros((src_len, tar_len), dtype=np_int) for i in range(src_len): for j in range(tar_len): if src[i] == tar[j]: k = w_mat[i - 1, j - 1] c_mat[i, j] = (c_mat[i - 1, j - 1] + self._f_func(k + 1) - self._f_func(k)) w_mat[i, j] = k + 1 else: if c_mat[i - 1, j] > c_mat[i, j - 1]: c_mat[i, j] = c_mat[i - 1, j] w_mat[i, j] = 0 else: c_mat[i, j] = c_mat[i, j - 1] w_mat[i, j] = 0 return c_mat[src_len - 1, tar_len - 1]
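# A note on the doctest values of wlcs() above: with the usual Rouge-W weight
# f(k) = k**2 (the actual _f_func is configured elsewhere in the class, so
# this is an assumption for illustration), the consecutive common run 'at'
# shared by 'cat' and 'hat' telescopes to f(2) = 4, exactly as the DP does.
f = lambda k: k ** 2  # assumed weight function
c, k = 0, 0
for _ in 'at':            # the shared consecutive run
    c += f(k + 1) - f(k)  # same update as the c_mat recurrence above
    k += 1
assert c == 4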
def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): """Return the Gotoh score of two strings. The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with affine gap penalties. :param str src: source string for comparison :param str tar: target string for comparison :param float gap_open: the cost of an open alignment gap (1 by default) :param float gap_ext: the cost of an alignment gap extension (0.4 by default) :param function sim_func: a function that returns the similarity of two characters (identity similarity by default) :returns: Gotoh score :rtype: float >>> gotoh('cat', 'hat') 2.0 >>> gotoh('Niall', 'Neil') 1.0 >>> round(gotoh('aluminum', 'Catalan'), 12) -0.4 >>> gotoh('cat', 'hat') 2.0 """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) d_mat[0, 0] = 0 p_mat[0, 0] = float('-inf') q_mat[0, 0] = float('-inf') for i in range(1, len(src) + 1): d_mat[i, 0] = float('-inf') p_mat[i, 0] = -gap_open - gap_ext * (i - 1) q_mat[i, 0] = float('-inf') q_mat[i, 1] = -gap_open for j in range(1, len(tar) + 1): d_mat[0, j] = float('-inf') p_mat[0, j] = float('-inf') p_mat[1, j] = -gap_open q_mat[0, j] = -gap_open - gap_ext * (j - 1) for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): sim_val = sim_func(src[i - 1], tar[j - 1]) d_mat[i, j] = max(d_mat[i - 1, j - 1] + sim_val, p_mat[i - 1, j - 1] + sim_val, q_mat[i - 1, j - 1] + sim_val) p_mat[i, j] = max(d_mat[i - 1, j] - gap_open, p_mat[i - 1, j] - gap_ext) q_mat[i, j] = max(d_mat[i, j - 1] - gap_open, q_mat[i, j - 1] - gap_ext) i, j = (n - 1 for n in d_mat.shape) return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
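# Hedged usage note for gotoh() above (assumes the module-level numpy aliases
# and sim_ident are in scope): 'cat' vs 'hat' aligns with no gaps, scoring
# sim('c','h') + sim('a','a') + sim('t','t') = 0 + 1 + 1 = 2.0.
assert gotoh('cat', 'hat') == 2.0
# With gap_open=1 and gap_ext=0.4, a gap of length L costs 1 + 0.4 * (L - 1),
# which is what the p_mat / q_mat recurrences accumulate.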
def __init_statistics(self): stats = self.raw_stats if stats is not None: combined = np_array([[int(team), stats['oprs'][team], stats['dprs'][team], stats['ccwms'][team]] for team in stats['oprs'].keys()], np_object) else: teams = self.get_team()[:, 0] num_teams = len(teams) combined = np_rot90( np_array([teams, np_zeros(num_teams), np_zeros(num_teams), np_zeros(num_teams)], np_object))[::-1] self.stats = combined
def __init_matches(self): for match_type, var in [['qm', 'qualification_matches'], ['qf', 'quarter_final_matches'], ['sf', 'semi_final_matches'], ['f', 'final_matches']]: num_matches = self.__count_matches(self.raw_matches, match_type) if num_matches != 0: # zero = range(num_matches) red_teams = np_zeros((num_matches,), np_object) blue_teams = np_zeros((num_matches,), np_object) blue_scores = np_zeros((num_matches,), np_object) red_scores = np_zeros((num_matches,), np_object) match_code = np_zeros((num_matches,), np_object) match_numbers = np_arange(1, num_matches + 1, 1) for match in self.raw_matches: if match['comp_level'] == match_type: match_num = match['match_number'] - 1 red_teams[match_num] = [np_int(match['alliances']['red']['teams'][0][3:]), np_int(match['alliances']['red']['teams'][1][3:]), np_int(match['alliances']['red']['teams'][2][3:])] red_scores[match_num] = [-1 if match['alliances']['red']['score'] is None else match['alliances']['red']['score'], -1 if match['score_breakdown']['red']['auto'] is None else match['score_breakdown']['red']['auto'], -1 if match['score_breakdown']['red']['foul'] is None else match['score_breakdown']['red']['foul']] blue_teams[match_num] = [np_int(match['alliances']['blue']['teams'][0][3:]), np_int(match['alliances']['blue']['teams'][1][3:]), np_int(match['alliances']['blue']['teams'][2][3:])] blue_scores[match_num] = [-1 if match['alliances']['blue']['score'] is None else match['alliances']['blue']['score'], -1 if match['score_breakdown']['blue']['auto'] is None else match['score_breakdown']['blue']['auto'], -1 if match['score_breakdown']['blue']['foul'] is None else match['score_breakdown']['blue']['foul']] match_code[match_num] = match['key'] red_win = np_array(red_scores.tolist())[:, 0] > np_array(blue_scores.tolist())[:, 0] winner = np_array(['blue'] * len(red_win)) winner[red_win] = 'red' self.__setattr__(var, np_rot90(np_array([[match_type] * num_matches, match_numbers, red_teams, blue_teams, red_scores, blue_scores, winner, match_code], np_object))[::-1])
def updateScore(csvfile, score): """ Add or update score column and reorder """ import string head, rows = read_csv(csvfile) data = pd_read_csv(csvfile) data.index = data.index + 1 cols = data.columns.tolist() sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index) if 'Score' not in cols: data['Score'] = sco cols = ['Score'] + cols data = data[cols] colk = list(string.ascii_uppercase) for sc in score: try: coln = colk.index(sc[0]) val = sc[2] checked = sc[3] if checked: sco += val * data.iloc[:, coln] except: continue data['Score'] = sco data = data.sort_values('Score', ascending=False) updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']]) data = data.reset_index(drop=True) data.index = data.index + 1 data = data.rename_axis('Select', axis="columns") data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False) return data
def parse_matrix_part(matrix, szSub, ovSub): assert matrix.ndim == 3 assert np_ndim(szSub) == 1 assert len(szSub) == 3 assert np_ndim(ovSub) == 1 assert len(ovSub) == 3 matrix_shape = np_asarray(matrix.shape, dtype=int) len_each_section, _, _ = szSub shift_length, _, _ = ovSub len_each_section_range = np_arange(len_each_section) matrix_shape = np_ceil((matrix_shape - szSub + 1)/ovSub).astype(int) num_rows_overlap, num_elements, num_beams = matrix_shape result_matrix = np_zeros((np_prod(szSub), np_prod(matrix_shape))) cnt = 0 for i in range(num_beams): for j in range(num_elements): for k in range(num_rows_overlap): index_1 = len_each_section_range + k * shift_length index_2 = j index_3 = i tmp = matrix[index_1, index_2, index_3] result_matrix[:, cnt] = tmp cnt += 1 return result_matrix
def GHZ_state(n): r"""Generate the numpy form of a GHZ state. Args: n (int): number of qubits Returns: numpy.ndarray: a numpy array of shape ``(1, 2**n)`` Code example: .. code-block:: python from paddle_quantum.state import GHZ_state vector = GHZ_state(3) print(vector) :: [[0.70710678+0.j 0. +0.j 0. +0.j 0. +0.j 0. +0.j 0. +0.j 0. +0.j 0.70710678+0.j]] """ assert n > 2, 'qubit number must be larger than 2' state = np_zeros((1, 2**n)) state[0][0] = 1 / np.sqrt(2) state[0][-1] = 1 / np.sqrt(2) return state.astype("complex128")
def _lcsstr_stl(src, tar): """Return start positions & length for Ratcliff-Obershelp. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- tuple The start position in the source string, start position in the target string, and length of the longest common substring of strings src and tar. .. versionadded:: 0.1.0 """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) longest, src_longest, tar_longest = 0, 0, 0 for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): if src[i - 1] == tar[j - 1]: lengths[i, j] = lengths[i - 1, j - 1] + 1 if lengths[i, j] > longest: longest = lengths[i, j] src_longest = i tar_longest = j else: lengths[i, j] = 0 return src_longest - longest, tar_longest - longest, longest
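# Recovering the actual substring from the (src_start, tar_start, length)
# triple returned by _lcsstr_stl() above; assumes the module's np_zeros and
# np_int numpy aliases are in scope.
src, tar = 'aluminum', 'Catalan'
src_start, tar_start, length = _lcsstr_stl(src, tar)
assert src[src_start:src_start + length] == 'al'
assert tar[tar_start:tar_start + length] == 'al'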
def get_features(self, student_ID, semester): """Return semester-level and student-level feature vectors for a student.""" abs_df = self._academic_clusterer.courses_features tmp_df = abs_df[ abs_df[self._academic_clusterer.course_attr].isin(semester) ] tse_df = self._academic_clusterer.semesters_features tse_df = tse_df[ tse_df[self._academic_clusterer.studentId_attr]==student_ID ] if tse_df.empty: semester_lvl = 1 else: semester_lvl = tse_df[self._academic_clusterer.SEMESTERS_F_LABELS[0]].values.max() + 1 alpha = tmp_df['alpha'].values.sum() beta = tmp_df['beta'].values.sum() skewness = tmp_df['skewness'].values.sum() n_courses = len( semester ) semester_features = (semester_lvl, alpha, beta, skewness, n_courses) print(semester_features) cs_df = self._academic_clusterer.students_features cs_df = cs_df[ cs_df[self._academic_clusterer.studentId_attr] == student_ID ] if cs_df.empty: student_features = np_zeros((1,5)) else: student_features = cs_df[ self._academic_clusterer.STUDENTS_F_LABELS ].values return semester_features, student_features
def noise_dwt(cls, coeff, w): """Return the estimation of the DWT components noise level coeff: DWT coefficients w: pywt wavelet object """ n_boot = 1000 k_th = 10 k_std = 1. / np_sqrt(2) std_l = [] std_a = np_zeros(n_boot) wcomp = cls.wavecomp(coeff, w, len(coeff) - 1) for ii in range(n_boot): std_a[ii] = np_std(bootstrap_resample(wcomp, 10)) stdv = np_median(std_a) std_l.append(stdv) for ll in range(len(coeff) - 2, 0, -1): stdv = stdv * k_std std_l.append(stdv) std_l.append(0) std_l.reverse() return np_array(std_l) * k_th
def w_state(n, coeff=None): r"""Generate the numpy form of a W state. Args: n (int): number of qubits coeff (numpy.ndarray, optional): defaults to ``None``, i.e. uniform probability amplitudes (coefficients) Returns: numpy.ndarray: a numpy array of shape ``(1, 2**n)`` Code example: .. code-block:: python from paddle_quantum.state import w_state vector = w_state(3) print(vector) :: [[0. +0.j 0.57735027+0.j 0.57735027+0.j 0. +0.j 0.57735027+0.j 0. +0.j 0. +0.j 0. +0.j]] """ assert n > 0, 'qubit number must be positive' c = coeff if coeff is not None else np.ones((1, 2**n)) / np.sqrt(n) assert c.shape[0] == 1 and c.shape[1] == 2**n, 'The dimension of coeff is not right' state = np_zeros((1, 2**n)) for i in range(n): state[0][2**i] = c[0][n - i - 1] return state.astype("complex128")
def density_op(n): r"""Generate the numpy form of the density matrix :math:`|00..0\rangle \langle00..0|`. Args: n (int): number of qubits Returns: numpy.ndarray: a numpy array of shape ``(2**n, 2**n)`` Code example: .. code-block:: python from paddle_quantum.state import density_op state = density_op(2) print(state) :: [[1.+0.j 0.+0.j 0.+0.j 0.+0.j] [0.+0.j 0.+0.j 0.+0.j 0.+0.j] [0.+0.j 0.+0.j 0.+0.j 0.+0.j] [0.+0.j 0.+0.j 0.+0.j 0.+0.j]] """ assert n > 0, 'qubit number must be positive' rho = np_zeros((2**n, 2**n)) rho[0, 0] = 1 return rho.astype("complex128")
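# Hedged sanity checks for the three constructors above (GHZ_state, w_state,
# density_op), assuming the module's numpy aliases are in scope: state vectors
# should have unit norm and the density matrix unit trace.
import numpy as np

assert np.isclose(np.linalg.norm(GHZ_state(3)), 1.0)   # (|000> + |111>)/sqrt(2)
assert np.isclose(np.linalg.norm(w_state(3)), 1.0)     # three amplitudes 1/sqrt(3)
assert np.isclose(np.trace(density_op(2)).real, 1.0)   # pure |00><00| projector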
def get_captions(self, ix, necessary_num_img_captions): # # Fetch the sequence labels # NOTE: 1-indexed, not 0-indexed first_caption_idx = self.label_start_ix[ ix] - 1 # label_start_ix starts from 1 last_caption_idx = self.label_end_ix[ix] - 1 num_img_captions = last_caption_idx - first_caption_idx + 1 assert num_img_captions > 0, f"Image {ix} has no caption. Aborting!" # # If we require more captions per image for training # than are available, we sample with replacement. if num_img_captions < necessary_num_img_captions: # seq = np_zeros([necessary_num_img_captions, self.max_seq_length], dtype="int") for q in range(necessary_num_img_captions): ixl = randint(first_caption_idx, last_caption_idx) seq[q, :] = self.label[ixl, :self.max_seq_length] else: # # Unnecessary to choose the captions sequentially. Come back to this later... ixl = randint(first_caption_idx, last_caption_idx - necessary_num_img_captions + 1) seq = self.label[ixl:ixl + necessary_num_img_captions, :self.max_seq_length] return seq
def index_data(self, new_sequences: np_ndarray): """ The index_data function allows you to insert a large number of sequences :param numpy.ndarray new_sequences: The sequences to be inserted :returns: The number of sequences (sub sequences) inserted into the tree (or trees) :rtype: numpy.array """ # Ts Conversion to PAA if new_sequences.shape[-1] > 1: # add dim to avoid tslearn warning new_sequences = new_sequences.reshape(new_sequences.shape + (1, )) npaa = self._paa.fit_transform(new_sequences) # To count the number of objects in each tree cmpt_insert = np_zeros(shape=self.number_tree) for i, tree in self.forest.items(): # Retrieves the indices of the tree, in the multi-tree case npaa_tmp = npaa[:, self.indices_partition[i]] npaa_tmp = npaa_tmp.reshape(npaa_tmp.shape[:-1]) for npa_tp in npaa_tmp: tree.insert_paa(npa_tp) cmpt_insert[i] += 1 # Returns array[tree_index] with the number of inserted objects for each tree return cmpt_insert
def number_nodes_visited(self, sub_query: np_array, ntss_tmp: np_ndarray): """ Count the average number of nodes visited in the tree when computing the approximation. :param numpy.array sub_query: The sequence to be evaluated :param numpy.ndarray ntss_tmp: Reference sequences :returns: The number of nodes in the tree and the average number of nodes visited for the *i*\ CFOF approximation :rtype: numpy.array """ q_paa = self.isax.transform_paa([sub_query])[0] ntss_tmp_paa = self.isax.transform_paa(ntss_tmp) distance_q_p = cdist([q_paa.reshape(q_paa.shape[:-1])], ntss_tmp_paa.reshape(ntss_tmp_paa.shape[:-1]))[0] list_parent_node = np_zeros(len(self.node_list), dtype=np_uint32) for tmp_node in self.node_list: if tmp_node.id_numpy == 0: continue list_parent_node[tmp_node.id_numpy] = tmp_node.parent.id_numpy count_visited_nodes_list = nodes_visited_for_all_seq_ref( len(ntss_tmp_paa), distance_q_p, self.max_array, self.min_array, list_parent_node) return self.num_nodes, count_visited_nodes_list.mean()
def populateImageMaps(self): """Load the transformed data into the main image maps""" # reset these guys... JIC self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor)) self.im2RowIndicies = {} # add to the grid wherever we find a contig row_index = -1 for point in np_around(self.PM.transformedCP): row_index += 1 # can only bin things once! if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies: # add to the row_index dict so we can relate the # map back to individual points later p = tuple(point) if p in self.im2RowIndicies: self.im2RowIndicies[p].append(row_index) else: self.im2RowIndicies[p] = [row_index] # now increment in the grid # for each point we encounter we increment # its position + the positions to each side # and touching each corner self.incrementViaRowIndex(row_index, p)
def dist_abs(self, src, tar): """Return the FlexMetric distance of two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float FlexMetric distance Examples -------- >>> cmp = FlexMetric() >>> cmp.dist_abs('cat', 'hat') 0.8 >>> cmp.dist_abs('Niall', 'Neil') 1.5 >>> cmp.dist_abs('aluminum', 'Catalan') 6.7 >>> cmp.dist_abs('ATCG', 'TAGC') 2.1999999999999997 .. versionadded:: 0.4.0 """ src_len = len(src) tar_len = len(tar) if src == tar: return 0 if not src: return sum(self._cost('', -1, tar, j) for j in range(len(tar))) if not tar: return sum(self._cost(src, i, '', -1) for i in range(len(src))) d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) for i in range(1, src_len + 1): d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1) for j in range(1, tar_len + 1): d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1) src_lc = src.lower() tar_lc = tar.lower() for i in range(src_len): for j in range(tar_len): d_mat[i + 1, j + 1] = min( d_mat[i + 1, j] + self._cost('', -1, tar_lc, j), # ins d_mat[i, j + 1] + self._cost(src_lc, i, '', -1), # del d_mat[i, j] + (self._cost(src_lc, i, tar_lc, j) if src[i] != tar[j] else 0), # sub/== ) return d_mat[src_len, tar_len]
def optimize_one_day( i: int, op_ct_func: Callable, pw_mat: np_ndarray, o_cub: np_ndarray, rsk_tgt_mat: np_ndarray, rsk_msk: np_ndarray, hg_tools: Tuple = (-1, )) -> np_ndarray: port_mat = o_cub[:, i, :] pw_ar = pw_mat[:, i, :] rsk_tgt = rsk_tgt_mat[i, :] hg_tools = list(hg_tools) rsk_msk = list(rsk_msk) pw_ar = pw_ar.reshape(-1) def _op_tgt(w: np_ndarray) -> float: tw = pw_ar.copy() tw[hg_tools] = w return op_ct_func(port_mat.T @ tw, rsk_tgt=rsk_tgt, rsk_msk=rsk_msk) w_0 = np_rd.randn(len(hg_tools)) tr = np_zeros(port_mat.shape[0], dtype=float) tr[hg_tools] = minimize(_op_tgt, w_0, method="nelder-mead").x return tr
def node_comparison_prec_recall(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp, p, out_comp_nm): N_matches_test = 0 Metric = np_zeros((N_test_comp, N_pred_comp)) for i, test_complex in enumerate(known_complex_nodes_list): N_match_pred = 0 for j, pred_complex in enumerate(fin_list_graphs): T = set(test_complex) P = pred_complex[0] C = len(T.intersection(P)) A = len(P.difference(T)) B = len(T.difference(P)) if float(C) / (A + C) > p and float(C) / (B + C) > p: Metric[i, j] = 1 N_match_pred = N_match_pred + 1 if N_match_pred > 0: N_matches_test = N_matches_test + 1 plot_pr_curve_orig(Metric, fin_list_graphs, out_comp_nm) Recall = float(N_matches_test) / N_test_comp N_matches_pred = np_count_nonzero(np_sum(Metric, axis=0)) Precision = float(N_matches_pred) / N_pred_comp if Precision == Recall == 0: F1_score = 0 else: F1_score = 2 * Precision * Recall / (Precision + Recall) return Precision, Recall, F1_score
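# Worked example of the match criterion in node_comparison_prec_recall above:
# C/(A+C) is the fraction of the predicted complex covered by the test complex
# and C/(B+C) the converse; the sets here are illustrative.
T = {1, 2, 3, 4}       # test complex
P = {2, 3, 4, 5}       # predicted complex
C = len(T & P); A = len(P - T); B = len(T - P)
assert (C / (A + C), C / (B + C)) == (0.75, 0.75)  # a match whenever p < 0.75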
def score_by_listvrang(k_list_result, k_rho): """ Compute CFOF score approximations from the vrang list, one for each value of :math:`\\varrho` in the ``k_rho`` list. :param list(float) k_list_result: The list of vrang of the sequence to be evaluated :param list(float) k_rho: The list of :math:`\\varrho` for CFOF score approximations computation. :returns: The list of CFOF score approximations :rtype: list(float) """ nb_obj_total = len(k_list_result) score_list = np_zeros(len(k_rho)) need_nn_prec = 0 for k_rho_ite, k_rho_var in enumerate(k_rho): need_nn = k_rho_var - need_nn_prec while need_nn > 0: need_nn -= 1 estim_final = k_list_result.pop(0) score_list[k_rho_ite] = estim_final / nb_obj_total need_nn_prec = k_rho_var return score_list
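# Worked example of score_by_listvrang() above: with four vrang values and
# k_rho = [1, 3], the scores are the 1st and 3rd smallest vrang divided by
# the total object count (4). Assumes np_zeros is the usual numpy alias.
scores = score_by_listvrang([10.0, 20.0, 30.0, 40.0], [1, 3])
assert list(scores) == [10.0 / 4, 30.0 / 4]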
def _plot(self, x, y=None): cr = True if y is None: cr = False y = x length = x.shape[0] rplot = np_zeros((length, length)) if cr: np_fill_diagonal(rplot, self.norm.compute(x, y)) if self.norm.is_simmetric: for lag in range(1, length): d = self.norm.compute(x[0:-lag], y[lag:]) np_fill_diagonal(rplot[lag:, 0:-lag], d) np_fill_diagonal(rplot[0:-lag, lag:], d) # rplot = np_rot90(rplot) else: pass return rplot
def set_data(self): self.x_mat = np_zeros((self.col_mat.shape[1], self.col_mat.shape[0])) self.x_mat[:, :] = self.x self.tex = pg_makeRGBA(np_rot90(self.col_mat), levels=(50., 255.))[0] / 255. self.tex[..., 3] = self.tex[..., 0]
def vrang_list_for_all_seq_ref(len_seq_list, distance, max_array, min_array, cdf_mean, cdf_std, num_ts_by_node, index_cdf_bin, cdf_bins): """ Uses the function :func:`~pyCFOFiSAX.tree_iSAX.vrang_seq_ref` for each reference sequence. :param float len_seq_list: The number of reference sequences :param np_array distance: The distance between the two sequences :param np_ndarray max_array: Max distances between the nodes of the tree and the reference sequence :param np_ndarray min_array: Min distances between the nodes of the tree and the reference sequence :param np_ndarray cdf_mean: The average distances between the nodes of the tree and the reference sequence :param np_array cdf_std: Dispersion of distances in each leaf node :param np_array num_ts_by_node: The number of sequences in each leaf node :param np_array index_cdf_bin: The index of the CDF ``cdf_bins`` :param np_array cdf_bins: Normal distribution CDF values centered at the origin with unit standard deviation :returns: The list of vrang values :rtype: np_array """ vrang_array = np_zeros(len_seq_list) for ii_tmp in prange(len_seq_list): vrang_array[ii_tmp] = vrang_seq_ref( distance[ii_tmp], max_array[ii_tmp], min_array[ii_tmp], cdf_mean[ii_tmp], cdf_std, num_ts_by_node, index_cdf_bin, cdf_bins) return vrang_array
def lcsseq(self, src, tar): """Return the longest common subsequence of two strings. Based on the dynamic programming algorithm from http://rosettacode.org/wiki/Longest_common_subsequence :cite:`rosettacode:2018b`. This is licensed GFDL 1.2. Modifications include: conversion to a numpy array in place of a list of lists Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- str The longest common subsequence Examples -------- >>> sseq = LCSseq() >>> sseq.lcsseq('cat', 'hat') 'at' >>> sseq.lcsseq('Niall', 'Neil') 'Nil' >>> sseq.lcsseq('aluminum', 'Catalan') 'aln' >>> sseq.lcsseq('ATCG', 'TAGC') 'AC' """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) # row 0 and column 0 are initialized to 0 already for i, src_char in enumerate(src): for j, tar_char in enumerate(tar): if src_char == tar_char: lengths[i + 1, j + 1] = lengths[i, j] + 1 else: lengths[i + 1, j + 1] = max(lengths[i + 1, j], lengths[i, j + 1]) # read the substring out from the matrix result = '' i, j = len(src), len(tar) while i != 0 and j != 0: if lengths[i, j] == lengths[i - 1, j]: i -= 1 elif lengths[i, j] == lengths[i, j - 1]: j -= 1 else: result = src[i - 1] + result i -= 1 j -= 1 return result
def sim(self, src, tar): """Return the BI-SIM similarity of two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float BI-SIM similarity Examples -------- >>> cmp = BISIM() >>> cmp.sim('cat', 'hat') 0.5 >>> cmp.sim('Niall', 'Neil') 0.4 >>> cmp.sim('aluminum', 'Catalan') 0.3125 >>> cmp.sim('ATCG', 'TAGC') 0.375 .. versionadded:: 0.4.0 """ src_len = len(src) tar_len = len(tar) if src == tar: return 1.0 if not src or not tar: return 0.0 def _id(src_pos, tar_pos): s = 0 for i in range(self._qval): s += int(src[src_pos + i] == tar[tar_pos + i]) return s / self._qval src = src[0].swapcase() * (self._qval - 1) + src tar = tar[0].swapcase() * (self._qval - 1) + tar d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float) for i in range(1, src_len + 1): for j in range(1, tar_len + 1): d_mat[i, j] = max( d_mat[i - 1, j - 1] + _id(i - 1, j - 1), # sub/== d_mat[i - 1, j], # ins d_mat[i, j - 1], # del ) return d_mat[src_len, tar_len] / max(src_len, tar_len)
def __getitem__(self, index): """This function returns a tuple that is further passed to collate_fn """ ix, it_pos_now, wrapped = index # self.split_ix[index] if self.use_att: att_feat = self.att_loader.get(str(self.info["images"][ix]["id"])) # Reshape to K x C att_feat = att_feat.reshape(-1, att_feat.shape[-1]) if self.norm_att_feat: att_feat = att_feat / np_norm(att_feat, 2, 1, keepdims=True) if self.use_box: box_feat = self.box_loader.get( str(self.info["images"][ix]["id"])) # divided by image width and height x1, y1, x2, y2 = np_hsplit(box_feat, 4) h, w = ( self.info["images"][ix]["height"], self.info["images"][ix]["width"], ) box_feat = np_hstack( (x1 / w, y1 / h, x2 / w, y2 / h, (x2 - x1) * (y2 - y1) / (w * h))) # question? x2-x1+1?? if self.norm_box_feat: box_feat = box_feat / np_norm( box_feat, 2, 1, keepdims=True) att_feat = np_hstack([att_feat, box_feat]) # sort the features by the size of boxes att_feat = np_stack( sorted(att_feat, key=lambda x: x[-1], reverse=True)) else: att_feat = np_zeros((0, 0), dtype="float32") if self.use_fc: try: fc_feat = self.fc_loader.get(str( self.info["images"][ix]["id"])) except: # Use average of attention when there is no fc provided (For bottomup feature) fc_feat = att_feat.mean(0) else: fc_feat = np_zeros((0), dtype="float32") if hasattr(self, "h5_label_file"): seq = self.get_captions(ix, self.necessary_num_img_captions) else: seq = None return (fc_feat, att_feat, seq, ix, it_pos_now, wrapped)
def vector(self, sentence): v = np_zeros(len(self.vocab), dtype=int) for word in sentence.split(' '): for i, _word in enumerate(self.vocab): if _word == word: # print(_word) v[i] = 1 return v
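# The inner scan over self.vocab makes vector() O(len(sentence) * len(vocab)).
# A hedged sketch of the usual fix: precompute a word -> index dict once and
# do O(1) lookups (the names here are illustrative, not the original API).
import numpy as np

def bow_vector(sentence, vocab_index):
    v = np.zeros(len(vocab_index), dtype=int)
    for word in sentence.split(' '):
        idx = vocab_index.get(word)
        if idx is not None:
            v[idx] = 1
    return v

vocab_index = {w: i for i, w in enumerate(['the', 'cat', 'sat'])}
assert list(bow_vector('the cat', vocab_index)) == [1, 1, 0]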
def _vectorize_text(self, text): vectors = [] for word in text.split(): # if there's no word2vec vector for this word, put in a vec of all 0 try: vectors.append(self.word_vectors.word_vec(word)) except KeyError: vectors.append(np_zeros(self.word_vec_size)) return vectors
def lcsstr(self, src, tar): """Return the longest common substring of two strings. Longest common substring (LCSstr). Based on the code from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring :cite:`Wikibooks:2018`. This is licensed Creative Commons: Attribution-ShareAlike 3.0. Modifications include: - conversion to a numpy array in place of a list of lists - conversion to Python 2/3-safe range from xrange via six Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- str The longest common substring Examples -------- >>> sstr = LCSstr() >>> sstr.lcsstr('cat', 'hat') 'at' >>> sstr.lcsstr('Niall', 'Neil') 'N' >>> sstr.lcsstr('aluminum', 'Catalan') 'al' >>> sstr.lcsstr('ATCG', 'TAGC') 'A' .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) longest, i_longest = 0, 0 for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): if src[i - 1] == tar[j - 1]: lengths[i, j] = lengths[i - 1, j - 1] + 1 if lengths[i, j] > longest: longest = lengths[i, j] i_longest = i else: lengths[i, j] = 0 return src[i_longest - longest : i_longest]
def _coding_mask(self, seq_id): """Build mask indicating which bases in a sequences are coding.""" # safe way to calculate coding bases as it accounts # for the potential of overlapping genes coding_mask = np_zeros(self.last_coding_base[seq_id]) for pos in self.genes[seq_id].values(): coding_mask[pos[0]:pos[1] + 1] = 1 return coding_mask
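# Why _coding_mask() above builds a mask rather than summing gene lengths:
# overlapping genes would otherwise be double-counted. Minimal illustration
# with two overlapping (inclusive) intervals, as in the slice pos[0]:pos[1]+1.
import numpy as np

mask = np.zeros(10)
for start, end in [(0, 4), (3, 7)]:
    mask[start:end + 1] = 1
assert mask.sum() == 8  # union of [0, 4] and [3, 7] covers 8 bases, not 10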
def sim_score(self, src, tar): """Return the SAPS similarity between two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- int The SAPS similarity between src & tar Examples -------- >>> cmp = SAPS() >>> cmp.sim_score('cat', 'hat') 0 >>> cmp.sim_score('Niall', 'Neil') 3 >>> cmp.sim_score('aluminum', 'Catalan') -11 >>> cmp.sim_score('ATCG', 'TAGC') -1 >>> cmp.sim_score('Stevenson', 'Stinson') 16 .. versionadded:: 0.4.0 """ src = self._tokenizer.tokenize(src).get_list() tar = self._tokenizer.tokenize(tar).get_list() src = ''.join([_[0].upper() + _[1:].lower() for _ in src]) tar = ''.join([_[0].upper() + _[1:].lower() for _ in tar]) d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int) for i in range(len(src)): d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src[i]) for j in range(len(tar)): d_mat[0, j + 1] = d_mat[0, j] + self._g(tar[j]) for i in range(len(src)): for j in range(len(tar)): d_mat[i + 1, j + 1] = max( d_mat[i, j + 1] + self._g(src[i]), # ins d_mat[i + 1, j] + self._g(tar[j]), # del d_mat[i, j] + self._s(src[i], tar[j]), # sub/== ) return d_mat[len(src), len(tar)]
def add_reference_surface(self, xmin, xmax, ymin, ymax, image): cx = np_linspace(xmin/self.kx,xmax/self.kx,image.shape[1]) cy = np_linspace(ymin/self.ky,ymax/self.ky,image.shape[0]) cz = np_zeros((image.shape[1],image.shape[0])) ref_tex = pg_makeRGBA(np_rot90(image, k=3))[0]/255. self.ref_surf = gl.GLSurfacePlotItem(x=cx, y=cy, z=cz, colors = ref_tex, shader='balloon') self.ref_surf.translate(-self.xoff,-self.yoff,self.zoff) self.addItem(self.ref_surf) return self.ref_surf
def transformCP(self, silent=False, nolog=False, min=None, max=None): """Do the main transformation on the coverage profile data""" shrinkFn = np_log10 if(nolog): shrinkFn = lambda x:x s = (self.numContigs,3) self.transformedCP = np_zeros(s) if(not silent): print " Dimensionality reduction" # get the median distance from the origin unit_vectors = [(np_cos(i*2*np_pi/self.numStoits),np_sin(i*2*np_pi/self.numStoits)) for i in range(self.numStoits)] for i in range(len(self.indices)): norm = np_norm(self.covProfiles[i]) if(norm != 0): radial = shrinkFn(norm) else: radial = norm shifted_vector = np_array([0.0,0.0]) flat_vector = (self.covProfiles[i] / sum(self.covProfiles[i])) for j in range(self.numStoits): shifted_vector[0] += unit_vectors[j][0] * flat_vector[j] shifted_vector[1] += unit_vectors[j][1] * flat_vector[j] # log scale it towards the centre scaling_vector = shifted_vector * self.scaleFactor sv_size = np_norm(scaling_vector) if(sv_size > 1): shifted_vector /= shrinkFn(sv_size) self.transformedCP[i,0] = shifted_vector[0] self.transformedCP[i,1] = shifted_vector[1] self.transformedCP[i,2] = radial if(not silent): print " Reticulating splines" # finally scale the matrix to make it equal in all dimensions if(min is None): min = np_amin(self.transformedCP, axis=0) max = np_amax(self.transformedCP, axis=0) max = max - min max = max / (self.scaleFactor-1) for i in range(0,3): self.transformedCP[:,i] = (self.transformedCP[:,i] - min[i])/max[i] return(min,max)
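# Geometry behind transformCP() above: each of the numStoits coverage channels
# gets a unit vector on the circle and the normalised profile is projected onto
# the plane. A hedged 3-channel illustration in plain numpy, outside the class:
import numpy as np

num_stoits = 3
units = [np.array([np.cos(i * 2 * np.pi / num_stoits), np.sin(i * 2 * np.pi / num_stoits)]) for i in range(num_stoits)]
flat = np.array([1.0, 1.0, 1.0]) / 3.0  # perfectly even coverage profile
xy = sum(u * f for u, f in zip(units, flat))
assert np.allclose(xy, 0.0, atol=1e-12)  # even coverage projects to the centre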
def _parse_data(self, infile): data = {} with open(infile) as fp: fp.readline() genomes = set() for line in fp: fields = line.rstrip().split('\t') fields[0] = re.sub(r'_genes$', "", fields[0]) fields[2] = re.sub(r'_genes$', "", fields[2]) genomes.add(fields[0]) genomes.add(fields[2]) try: data[fields[0]][fields[2]] = [float(fields[5]), float(fields[7])] except KeyError: data[fields[0]] = {} data[fields[0]][fields[2]] = [float(fields[5]), float(fields[7])] except IndexError as e: print(fields) raise e self.perc_ids = np_zeros([len(genomes), len(genomes)]) self.perc_aln = np_zeros([len(genomes), len(genomes)]) genome_to_index = {} self.genomes = [None] * len(genomes) for n, g in enumerate(alphanumeric_sort(genomes)): genome_to_index[g] = n self.genomes[n] = g self.genomes = np_array(self.genomes) for g1, g2 in permutations(genomes, 2): try: self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - data[g1][g2][0] self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = data[g1][g2][1] except KeyError: self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - data[g2][g1][0] self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = data[g2][g1][1]
def __init__(self, dbFileName, plot=False, force=False, numImgMaps=1): # worker classes self.PM = ProfileManager(dbFileName) # store our data self.BM = BinManager(pm=self.PM) # store our bins # heat maps self.numImgMaps = numImgMaps self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor)) self.blurredMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor)) # we need a way to reference from the imageMaps back onto the transformed data self.im2RowIndicies = {} # When blurring the raw image maps I chose a radius to suit my data, you can vary this as you like self.blurRadius = 2 self.span = 30 # amount we can travel about when determining "hot spots" # misc self.minSize = 10 # Min number of contigs for a bin to be considered legit self.minVol = 1000000 # Override on the min size, if we have this many BP self.forceWriting = force self.debugPlots = plot self.imageCounter = 1 # when we print many images self.roundNumber = 0 # how many times have we tried to make a bin?
def array2PETScVec(v): """ Converts (copies) a sequential array/vector on process 0 to a distributed PETSc Vec input : v, numpy array on proc 0, None (or whatever) on other proc output: PETSc Vec distributed on all procs """ comm = MPI.COMM_WORLD rank = comm.Get_rank() # v is (probably) only redefined on proc 0 if rank == 0: n = len(v) else: n = None n = comm.bcast(n, root = 0) #print "DEBUG", __name__, "rank=", rank, "n=", n x = PETSc.Vec() x.create(comm) x.setSizes(n) x.setFromOptions() istart,iend = x.getOwnershipRange() nloc = iend - istart Istart = comm.gather(istart,root = 0) Iend = comm.gather(iend ,root = 0) vloc = np_zeros(nloc,PETSc.ScalarType) if rank == 0: vloc[:nloc ] = v[:nloc] for iproc in range(1,comm.size): if rank == 0: i0 = Istart[iproc] i1 = Iend [iproc] comm.Send(v[i0:i1], dest=iproc, tag=77) elif rank == iproc: comm.Recv(vloc, source=0, tag=77) x.setArray(vloc) return x
def PETScVec2array(x): """ Converts (copies) a distributed PETSc Vec to a sequential array on process 0 input : x, PETSc Vec distributed on all procs output: numpy array on proc 0 """ comm = MPI.COMM_WORLD rank = comm.Get_rank() vloc = x.getArray() n = x.getSize() istart,iend = x.getOwnershipRange() nloc = iend - istart Istart = comm.gather(istart,root = 0) Iend = comm.gather(iend ,root = 0) if rank == 0: v = np_zeros(n,PETSc.ScalarType) else: v = None if rank == 0: v[:nloc ] = vloc for iproc in range(1,comm.size): if rank == 0: i0 = Istart[iproc] i1 = Iend [iproc] comm.Recv(v[i0:i1], source=iproc, tag=77) elif rank == iproc: comm.Send(vloc, dest=0, tag=77) return v
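# Hedged round-trip check for array2PETScVec() / PETScVec2array() above;
# assumes petsc4py and mpi4py are installed and the script is launched with
# mpiexec. On rank 0 the gathered array should equal the original input.
from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
v = np.arange(10.0) if comm.Get_rank() == 0 else None
x = array2PETScVec(v)    # scatter from rank 0 to a distributed Vec
w = PETScVec2array(x)    # gather back to a numpy array on rank 0
if comm.Get_rank() == 0:
    assert np.allclose(v, w)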
def __init__(self, orbit, orbit_dict, q_rects = None, roi_movable = False, lock_aspect = True, parent = None, labels = 1, x_label = 'x', y_label = 'y', x_unit = "", y_unit = "", v_offset = (0,0), prefs = None, depth_meas = True, iface = None): super(OrbitViewer, self).__init__(parent) self.plots = [] data_f = [] sim_f = [] self.v_offset = v_offset self.v_offset_data = self.v_offset[0] self.v_offset_sim = self.v_offset[1] self.orbit_label = orbit_dict.get_instrument() + " - Orbit "+str(orbit) self.x_unit = x_unit self.y_unit = y_unit self.orbit_dict=orbit_dict self.prefs = prefs self.iface = iface if orbit_dict.data: for band in orbit_dict.data: data_f.append(np_mean(band,0)) else: for band in orbit_dict.sim: data_f.append(np_zeros(band.shape[1:])) if orbit_dict.sim: for band in orbit_dict.sim: sim_f.append(np_mean(band,0)) else: for band in orbit_dict.data: sim_f.append(np_zeros(band.shape[1:])) ii = 0 for band in orbit_dict.data: depth_cb = CreateDepthLayer(self.orbit_dict, ii, QgsProject.instance().readPath("./"), self.iface) self.plots.append(SinglePlot(images = [data_f[ii], sim_f[ii]], images_label = ["data", "sim"], label_text = self.orbit_label+" Frequency band "+str(ii+1), q_rects = q_rects, roi_movable = roi_movable, lock_aspect = lock_aspect, x_label = x_label, y_label = y_label, x_unit = x_unit, y_unit = y_unit, depth_cb = depth_cb.run, depth_meas = depth_meas)) self.addItem(self.plots[-1], row=0, col=(ii)) ii = ii + 1 self.set_pos_label(0)
def dist_abs(self, src, tar): """Return the Gotoh score of two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float Gotoh score Examples -------- >>> cmp = Gotoh() >>> cmp.dist_abs('cat', 'hat') 2.0 >>> cmp.dist_abs('Niall', 'Neil') 1.0 >>> round(cmp.dist_abs('aluminum', 'Catalan'), 12) -0.4 >>> cmp.dist_abs('cat', 'hat') 2.0 .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) d_mat[0, 0] = 0 p_mat[0, 0] = float('-inf') q_mat[0, 0] = float('-inf') for i in range(1, len(src) + 1): d_mat[i, 0] = float('-inf') p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1) q_mat[i, 0] = float('-inf') q_mat[i, 1] = -self._gap_open for j in range(1, len(tar) + 1): d_mat[0, j] = float('-inf') p_mat[0, j] = float('-inf') p_mat[1, j] = -self._gap_open q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1) for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): sim_val = self._sim_func(src[i - 1], tar[j - 1]) d_mat[i, j] = max( d_mat[i - 1, j - 1] + sim_val, p_mat[i - 1, j - 1] + sim_val, q_mat[i - 1, j - 1] + sim_val, ) p_mat[i, j] = max( d_mat[i - 1, j] - self._gap_open, p_mat[i - 1, j] - self._gap_ext, ) q_mat[i, j] = max( d_mat[i, j - 1] - self._gap_open, q_mat[i, j - 1] - self._gap_ext, ) i, j = (n - 1 for n in d_mat.shape) return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
def loadData(self, timer, condition, # condition as set by another function bids=[], # if this is set then only load those contigs with these bin ids verbose=True, # many to some output messages silent=False, # some to no output messages loadCovProfiles=True, loadKmerPCs=True, loadKmerVarPC=True, loadRawKmers=False, makeColors=True, loadContigNames=True, loadContigLengths=True, loadContigGCs=True, loadBins=False, loadLinks=False): """Load pre-parsed data""" timer.getTimeStamp() if(silent): verbose=False if verbose: print "Loading data from:", self.dbFileName try: self.numStoits = self.getNumStoits() self.condition = condition self.indices = self.dataManager.getConditionalIndices(self.dbFileName, condition=condition, silent=silent) if(verbose): print " Loaded indices with condition:", condition self.numContigs = len(self.indices) if self.numContigs == 0: print " ERROR: No contigs loaded using condition:", condition return if(not silent): print " Working with: %d contigs" % self.numContigs if(loadCovProfiles): if(verbose): print " Loading coverage profiles" self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices) self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(self.dbFileName, indices=self.indices) # work out average coverages self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles]) if loadRawKmers: if(verbose): print " Loading RAW kmer sigs" self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices) if(loadKmerPCs): self.kmerPCs = self.dataManager.getKmerPCAs(self.dbFileName, indices=self.indices) if(verbose): print " Loading PCA kmer sigs (" + str(len(self.kmerPCs[0])) + " dimensional space)" self.kmerNormPC1 = np_copy(self.kmerPCs[:,0]) self.kmerNormPC1 -= np_min(self.kmerNormPC1) self.kmerNormPC1 /= np_max(self.kmerNormPC1) if(loadKmerVarPC): self.kmerVarPC = self.dataManager.getKmerVarPC(self.dbFileName, indices=self.indices) if(verbose): print " Loading PCA kmer variance (total variance: %.2f" % np_sum(self.kmerVarPC) + ")" if(loadContigNames): if(verbose): print " Loading contig names" self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices) if(loadContigLengths): self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices) if(verbose): print " Loading contig lengths (Total: %d BP)" % ( sum(self.contigLengths) ) if(loadContigGCs): self.contigGCs = self.dataManager.getContigGCs(self.dbFileName, indices=self.indices) if(verbose): print " Loading contig GC ratios (Average GC: %0.3f)" % ( np_mean(self.contigGCs) ) if(makeColors): if(verbose): print " Creating color map" # use HSV to RGB to generate colors S = 1 # SAT and VAL remain fixed at 1. Reduce to make V = 1 # Pastels if that's your preference... 
self.colorMapGC = self.createColorMapHSV() if(loadBins): if(verbose): print " Loading bin assignments" self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices) if len(bids) != 0: # need to make sure we're not restricted in terms of bins bin_stats = self.getBinStats() for bid in bids: try: self.validBinIds[bid] = bin_stats[bid][0] self.isLikelyChimeric[bid]= bin_stats[bid][1] except KeyError: self.validBinIds[bid] = 0 self.isLikelyChimeric[bid]= False else: bin_stats = self.getBinStats() for bid in bin_stats: self.validBinIds[bid] = bin_stats[bid][0] self.isLikelyChimeric[bid] = bin_stats[bid][1] # fix the binned indices self.binnedRowIndices = {} for i in range(len(self.indices)): if(self.binIds[i] != 0): self.binnedRowIndices[i] = True else: # we need zeros as bin indices then... self.binIds = np_zeros(len(self.indices)) if(loadLinks): self.loadLinks() self.stoitColNames = self.getStoitColNames() except: print "Error loading DB:", self.dbFileName, exc_info()[0] raise
def _group_linkage_intersection(self): r"""Return the group linkage intersection of the tokens in src and tar. This is based on group linkage, as defined by :cite:`On:2007`. Most of this method is concerned with solving the assignment problem, in order to find the weight of the maximum weight bipartite matching. If the system has SciPy installed, we use its linear_sum_assignment function to get the assignments. Otherwise, we use the Hungarian algorithm of Munkres :cite:`Munkres:1957`, implemented in Python & Numpy. .. versionadded:: 0.4.0 """ intersection = self._crisp_intersection() src_only = sorted(self._src_tokens - self._tar_tokens) tar_only = sorted(self._tar_tokens - self._src_tokens) if linear_sum_assignment and not ( 'internal_assignment_problem' in self.params and self.params['internal_assignment_problem'] ): arr = np_zeros((len(tar_only), len(src_only))) for col in range(len(src_only)): for row in range(len(tar_only)): arr[row, col] = self.params['metric'].dist( src_only[col], tar_only[row] ) for row, col in zip(*linear_sum_assignment(arr)): sim = 1.0 - arr[row, col] if sim >= self.params['threshold']: intersection[src_only[col]] += (sim / 2) * ( self._src_tokens - self._tar_tokens )[src_only[col]] intersection[tar_only[row]] += (sim / 2) * ( self._tar_tokens - self._src_tokens )[tar_only[row]] else: n = max(len(tar_only), len(src_only)) arr = np_zeros((n, n), dtype=float) for col in range(len(src_only)): for row in range(len(tar_only)): arr[row, col] = self.params['metric'].dist( src_only[col], tar_only[row] ) src_only += [''] * (n - len(src_only)) tar_only += [''] * (n - len(tar_only)) orig_sim = 1 - np_copy(arr) # Step 1 for row in range(n): arr[row, :] -= arr[row, :].min() # Step 2 for col in range(n): arr[:, col] -= arr[:, col].min() while True: # Step 3 assignments = {} allocated_cols = set() allocated_rows = set() assigned_rows = set() assigned_cols = set() for row in range(n): if (arr[row, :] == 0.0).sum() == 1: col = arr[row, :].argmin() if col not in allocated_cols: assignments[row, col] = orig_sim[row, col] allocated_cols.add(col) assigned_rows.add(row) assigned_cols.add(col) for col in range(n): if (arr[:, col] == 0.0).sum() == 1: row = arr[:, col].argmin() if row not in allocated_rows: assignments[row, col] = orig_sim[row, col] allocated_rows.add(row) assigned_rows.add(row) assigned_cols.add(col) if len(assignments) == n: break marked_rows = {_ for _ in range(n) if _ not in assigned_rows} marked_cols = set() for row in sorted(set(marked_rows)): for col, mark in enumerate(arr[row, :] == 0.0): if mark: marked_cols.add(col) for row2 in range(n): if (row2, col) in assignments: marked_rows.add(row2) if n - len(marked_rows) + len(marked_cols) == n: # We have sufficient lines for col in range(n): row = arr[:, col].argmin() assignments[row, col] = orig_sim[row, col] break # Step 4 min_val = arr[tuple(marked_rows), :][ :, sorted(set(range(n)) - marked_cols) ].min() for row in range(n): for col in range(n): if row in marked_rows and col not in marked_cols: arr[row, col] -= min_val elif row not in marked_rows and col in marked_cols: arr[row, col] += min_val for row, col in assignments.keys(): sim = orig_sim[row, col] if sim >= self.params['threshold']: intersection[src_only[col]] += (sim / 2) * ( self._src_tokens - self._tar_tokens )[src_only[col]] intersection[tar_only[row]] += (sim / 2) * ( self._tar_tokens - self._src_tokens )[tar_only[row]] return intersection
def findNewClusterCenters(self, ss=0): """Find a putative cluster""" inRange = lambda x, l, u: x >= l and x < u # we work from the top view as this has the base clustering max_index = np_argmax(self.blurredMaps[0]) max_value = self.blurredMaps[0].ravel()[max_index] max_x = int(max_index / self.PM.scaleFactor) max_y = max_index - self.PM.scaleFactor * max_x max_z = -1 ret_values = [max_value, max_x, max_y] start_span = int(1.5 * self.span) span_len = 2 * start_span + 1 if self.debugPlots: self.plotRegion(max_x, max_y, max_z, fileName="Image_" + str(self.imageCounter), tag="column", column=True) self.imageCounter += 1 # make a 3d grid to hold the values working_block = np_zeros((span_len, span_len, self.PM.scaleFactor)) # go through the entire column (x_lower, x_upper) = self.makeCoordRanges(max_x, start_span) (y_lower, y_upper) = self.makeCoordRanges(max_y, start_span) super_putative_row_indices = [] for p in self.im2RowIndicies: if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper): for row_index in self.im2RowIndicies[p]: # check that the point is real and that it has not yet been binned if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies: # this is an unassigned point. multiplier = np_log10(self.PM.contigLengths[row_index]) self.incrementAboutPoint3D( working_block, p[0] - x_lower, p[1] - y_lower, p[2], multiplier=multiplier ) super_putative_row_indices.append(row_index) # blur and find the highest value bwb = ndi.gaussian_filter(working_block, 8) # self.blurRadius) densest_index = np_unravel_index(np_argmax(bwb), (np_shape(bwb))) max_x = densest_index[0] + x_lower max_y = densest_index[1] + y_lower max_z = densest_index[2] # now get the basic color of this dense point putative_center_row_indices = [] (x_lower, x_upper) = self.makeCoordRanges(max_x, self.span) (y_lower, y_upper) = self.makeCoordRanges(max_y, self.span) (z_lower, z_upper) = self.makeCoordRanges(max_z, 2 * self.span) for row_index in super_putative_row_indices: p = np_around(self.PM.transformedCP[row_index]) if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper) and inRange(p[2], z_lower, z_upper): # we are within the range! putative_center_row_indices.append(row_index) # make sure we have something to go on here if np_size(putative_center_row_indices) == 0: # it's all over! return None if np_size(putative_center_row_indices) == 1: # get out of here but keep trying # the calling function may restrict these indices return [[np_array(putative_center_row_indices)], ret_values] else: total_BP = sum([self.PM.contigLengths[i] for i in putative_center_row_indices]) if not self.isGoodBin(total_BP, len(putative_center_row_indices), ms=5): # Can we trust very small bins?. # get out of here but keep trying # the calling function should restrict these indices return [[np_array(putative_center_row_indices)], ret_values] else: # we've got a few good guys here, partition them up! 
# shift these guys around a bit center_k_vals = np_array([self.PM.kmerVals[i] for i in putative_center_row_indices]) k_partitions = self.partitionVals(center_k_vals) if len(k_partitions) == 0: return None else: center_c_vals = np_array([self.PM.transformedCP[i][-1] for i in putative_center_row_indices]) # center_c_vals = np_array([self.PM.averageCoverages[i] for i in putative_center_row_indices]) center_c_vals -= np_min(center_c_vals) c_max = np_max(center_c_vals) if c_max != 0: center_c_vals /= c_max c_partitions = self.partitionVals(center_c_vals) # take the intersection of the two partitions tmp_partition_hash_1 = {} id = 1 for p in k_partitions: for i in p: tmp_partition_hash_1[i] = id id += 1 tmp_partition_hash_2 = {} id = 1 for p in c_partitions: for i in p: try: tmp_partition_hash_2[(tmp_partition_hash_1[i], id)].append(i) except KeyError: tmp_partition_hash_2[(tmp_partition_hash_1[i], id)] = [i] id += 1 partitions = [ np_array([putative_center_row_indices[i] for i in tmp_partition_hash_2[key]]) for key in tmp_partition_hash_2.keys() ] # pcs = [[self.PM.averageCoverages[i] for i in p] for p in partitions] # print pcs return [partitions, ret_values]
def csrmatrix2PETScMat(L): """ Converts a sequential scipy sparse matrix (on process 0) to a PETSc Mat ('aij') matrix distributed on all processes input : L, scipy sparse matrix on proc 0 output: PETSc matrix distributed on all procs """ comm = MPI.COMM_WORLD rank = comm.Get_rank() # Get the data from the sequential scipy matrix if rank == 0: if L.format == 'csr': L2 = L else: L2 = L.tocsr() Ai = L2.indptr Aj = L2.indices Av = L2.data nnz = len(Aj) n,m = L2.shape else: n = None m = None nnz = None Ai = None Aj = None Av = None # Broadcast sizes n = comm.bcast(n ,root = 0) m = comm.bcast(m ,root = 0) nnz = comm.bcast(nnz,root = 0) B = PETSc.Mat() B.create(comm) B.setSizes([n, m]) B.setType('aij') B.setFromOptions() # Create a vector to get the local sizes, so that preallocation can be done later V = PETSc.Vec() V.create(comm) V.setSizes(n) V.setFromOptions() istart,iend = V.getOwnershipRange() V.destroy() nloc = iend - istart Istart = comm.gather(istart,root = 0) Iend = comm.gather(iend ,root = 0) if rank == 0: nnzloc = np_zeros(comm.size,'int') for i in range(comm.size): j0 = Ai[Istart[i]] j1 = Ai[Iend [i]] nnzloc[i] = j1 - j0 else: nnzloc = None nnzloc = comm.scatter(nnzloc,root = 0) ai = np_zeros(nloc+1 ,PETSc.IntType) aj = np_zeros(nnzloc+1 ,PETSc.IntType) av = np_zeros(nnzloc+1 ,PETSc.ScalarType) if rank == 0: j0 = Ai[Istart[0]] j1 = Ai[Iend [0]] ai[:nloc ] = Ai[:nloc] aj[:nnzloc] = Aj[j0:j1] av[:nnzloc] = Av[j0:j1] for iproc in range(1,comm.size): if rank == 0: i0 = Istart[iproc] i1 = Iend [iproc] j0 = Ai[i0] j1 = Ai[i1] comm.Send(Ai[i0:i1], dest=iproc, tag=77) comm.Send(Aj[j0:j1], dest=iproc, tag=78) comm.Send(Av[j0:j1], dest=iproc, tag=79) elif rank == iproc: comm.Recv(ai[:nloc ], source=0, tag=77) comm.Recv(aj[:nnzloc], source=0, tag=78) comm.Recv(av[:nnzloc], source=0, tag=79) ai = ai- ai[0] ai[-1] = nnzloc+1 B.setPreallocationCSR((ai,aj)) B.setValuesCSR(ai,aj,av) B.assemble() return B
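# Hedged usage sketch for csrmatrix2PETScMat() above; assumes petsc4py, mpi4py
# and scipy are available and the script runs under mpiexec. Only rank 0 needs
# to hold the scipy matrix; the result is distributed across all ranks.
from mpi4py import MPI
import scipy.sparse as sp

L = sp.identity(8, format='csr') if MPI.COMM_WORLD.Get_rank() == 0 else None
B = csrmatrix2PETScMat(L)
assert B.getSize() == (8, 8)   # global size of the distributed 'aij' matrix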
def zeros(n, type='float32'): return np_zeros(n, type)
def alignment(self, src, tar, score_only=False):
    """Return the ALINE alignments of two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    score_only : bool
        Return the score only, not the alignments

    Returns
    -------
    list of tuple(float, str, str) or float
        ALINE alignments and their scores or the top score

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignment('cat', 'hat')
    [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
    >>> cmp.alignment('niall', 'neil')
    [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
    >>> cmp.alignment('aluminum', 'catalan')
    [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
    >>> cmp.alignment('atcg', 'tagc')
    [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖', '‖ t a g ‖ c')]


    .. versionadded:: 0.4.0

    """

    def _sig_skip(seg):
        return self._c_skip

    def _sig_sub(seg1, seg2):
        return (
            self._c_sub - _delta(seg1, seg2) - _sig_vwl(seg1) - _sig_vwl(seg2)
        )

    def _sig_exp(seg1, seg2a, seg2b):
        return (
            self._c_exp
            - _delta(seg1, seg2a)
            - _delta(seg1, seg2b)
            - _sig_vwl(seg1)
            - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
        )

    def _sig_vwl(seg):
        return (
            0.0
            if seg['manner'] > self.feature_weights['high vowel']
            else self._c_vwl
        )

    def _delta(seg1, seg2):
        features = (
            self.c_features
            if max(seg1['manner'], seg2['manner'])
            > self.feature_weights['high vowel']
            else self.v_features
        )
        diff = 0.0
        for f in features:
            diff += abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f]
        return diff

    def _retrieve(i, j, score, out):
        def _record(score, out):
            out.append(('‖', '‖'))
            for i1 in range(i - 1, -1, -1):
                out.append((src[i1]['segment'], ''))
            for j1 in range(j - 1, -1, -1):
                out.append(('', tar[j1]['segment']))
            if self._mode == 'global':
                score += (i + j) * _sig_skip('')
            out = out[::-1]

            src_alignment = []
            tar_alignment = []
            out.append(('‖', '‖'))
            part = 0
            # outside the aligned region segments are accumulated as strings;
            # inside the region (odd parts) they are lists of padded tokens
            s_segment = ''
            t_segment = ''
            for ss, ts in out:
                if ss == '‖':
                    if part % 2 == 0:
                        src_alignment.append(s_segment)
                        tar_alignment.append(t_segment)
                        s_segment = []
                        t_segment = []
                    else:
                        src_alignment.append(' '.join(s_segment))
                        tar_alignment.append(' '.join(t_segment))
                        s_segment = ''
                        t_segment = ''
                    part += 1
                else:
                    if part % 2 == 0:
                        s_segment += ss
                        t_segment += ts
                    else:
                        s_segment.append(ss + ' ' * (len(ts) - len(ss)))
                        t_segment.append(ts + ' ' * (len(ss) - len(ts)))
            src_alignment = ' ‖ '.join(src_alignment).strip()
            tar_alignment = ' ‖ '.join(tar_alignment).strip()

            alignments.append((score, src_alignment, tar_alignment))
            return

        if s_mat[i, j] == 0:
            _record(score, out)
            return
        else:
            if (
                i > 0
                and j > 0
                and s_mat[i - 1, j - 1]
                + _sig_sub(src[i - 1], tar[j - 1])
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src[i - 1]['segment'], tar[j - 1]['segment']))
                _retrieve(
                    i - 1,
                    j - 1,
                    score + _sig_sub(src[i - 1], tar[j - 1]),
                    loc_out,
                )
                loc_out.pop()

            if (
                j > 0
                and s_mat[i, j - 1] + _sig_skip(tar[j - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(('-', tar[j - 1]['segment']))
                _retrieve(i, j - 1, score + _sig_skip(tar[j - 1]), loc_out)
                loc_out.pop()

            if (
                i > 0
                and j > 1
                and s_mat[i - 1, j - 2]
                + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (
                        src[i - 1]['segment'],
                        tar[j - 2]['segment'] + tar[j - 1]['segment'],
                    )
                )
                _retrieve(
                    i - 1,
                    j - 2,
                    score + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1]),
                    loc_out,
                )
                loc_out.pop()

            if (
                i > 0
                and s_mat[i - 1, j] + _sig_skip(src[i - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src[i - 1]['segment'], '-'))
                _retrieve(i - 1, j, score + _sig_skip(src[i - 1]), loc_out)
                loc_out.pop()

            if (
                i > 1
                and j > 0
                and s_mat[i - 2, j - 1]
                + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (
                        src[i - 2]['segment'] + src[i - 1]['segment'],
                        tar[j - 1]['segment'],
                    )
                )
                _retrieve(
                    i - 2,
                    j - 1,
                    score + _sig_exp(tar[j - 1], src[i - 2], src[i - 1]),
                    loc_out,
                )
                loc_out.pop()

    sg_max = 0.0

    src = list(src)
    tar = list(tar)

    for ch in range(len(src)):
        if src[ch] in self._phones:
            seg = src[ch]
            src[ch] = dict(self._phones[src[ch]])
            src[ch]['segment'] = seg
    for ch in range(len(tar)):
        if tar[ch] in self._phones:
            seg = tar[ch]
            tar[ch] = dict(self._phones[tar[ch]])
            tar[ch]['segment'] = seg

    src = [fb for fb in src if isinstance(fb, dict)]
    tar = [fb for fb in tar if isinstance(fb, dict)]

    # fold supplemental segments (e.g. diacritics) into the preceding
    # non-supplemental segment
    for i in range(1, len(src)):
        if 'supplemental' in src[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in src[j]:
                    for key, value in src[i].items():
                        if key != 'supplemental':
                            if key == 'segment':
                                src[j]['segment'] += value
                            else:
                                src[j][key] = value
                    j = 0
                j -= 1
    src = [fb for fb in src if 'supplemental' not in fb]

    for i in range(1, len(tar)):
        if 'supplemental' in tar[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in tar[j]:
                    for key, value in tar[i].items():
                        if key != 'supplemental':
                            if key == 'segment':
                                tar[j]['segment'] += value
                            else:
                                tar[j][key] = value
                    j = 0
                j -= 1
    tar = [fb for fb in tar if 'supplemental' not in fb]

    # replace symbolic feature values with their numeric weights
    for i in range(len(src)):
        for key in src[i].keys():
            if key != 'segment':
                src[i][key] = self.feature_weights[src[i][key]]
    for i in range(len(tar)):
        for key in tar[i].keys():
            if key != 'segment':
                tar[i][key] = self.feature_weights[tar[i][key]]

    src_len = len(src)
    tar_len = len(tar)

    s_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)

    if self._mode == 'global':
        for i in range(1, src_len + 1):
            s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src[i - 1])
        for j in range(1, tar_len + 1):
            s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar[j - 1])

    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            s_mat[i, j] = max(
                s_mat[i - 1, j] + _sig_skip(src[i - 1]),
                s_mat[i, j - 1] + _sig_skip(tar[j - 1]),
                s_mat[i - 1, j - 1] + _sig_sub(src[i - 1], tar[j - 1]),
                s_mat[i - 1, j - 2]
                + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                if j > 1
                else NINF,
                s_mat[i - 2, j - 1]
                + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                if i > 1
                else NINF,
                0 if self._mode in {'local', 'half-local'} else NINF,
            )

            if s_mat[i, j] > sg_max:
                if self._mode == 'semi-global':
                    if i == src_len or j == tar_len:
                        sg_max = s_mat[i, j]
                else:
                    sg_max = s_mat[i, j]

    if self._mode in {'global', 'half-local'}:
        dp_score = s_mat[src_len, tar_len]
    else:
        # sg_max, not s_mat.max(): in semi-global mode the score must come
        # from a border cell, which sg_max tracks (for local mode the two
        # are equal, since every cell is clamped at 0)
        dp_score = sg_max

    if score_only:
        return dp_score

    threshold = (1 - self._epsilon) * dp_score

    alignments = []
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            if self._mode in {'global', 'half-local'} and (
                i < src_len or j < tar_len
            ):
                continue
            if self._mode == 'semi-global' and (
                i < src_len and j < tar_len
            ):
                continue
            if s_mat[i, j] >= threshold:
                out = []
                for j1 in range(tar_len - 1, j - 1, -1):
                    out.append(('', tar[j1]['segment']))
                for i1 in range(src_len - 1, i - 1, -1):
                    out.append((src[i1]['segment'], ''))
                out.append(('‖', '‖'))
                _retrieve(i, j, 0, out)

    def _first_element(x):
        return x[0]

    return sorted(alignments, key=_first_element, reverse=True)
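# Hedged usage sketch for alignment() above, mirroring its doctest examples;
# assumes the enclosing ALINE class (as in abydos.distance) is available.
cmp = ALINE()
print(cmp.alignment('cat', 'hat'))                      # [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
print(cmp.alignment('niall', 'neil', score_only=True))  # 90.0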
def encode(self, C, maxlen):
    """Encode as one-hot"""
    X = np_zeros((maxlen, len(self.chars)), dtype=np.bool)  # pylint:disable=no-member
    for i, c in enumerate(C):
        X[i, self.char_indices[c]] = 1
    return X
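# A minimal sketch of the inverse of encode() above, assuming the class also
# keeps an indices_char reverse mapping (hypothetical here; only char_indices
# appears in this file).
def decode(self, X, calc_argmax=True):
    """Decode a one-hot (or probability) matrix back into a string."""
    if calc_argmax:
        X = X.argmax(axis=-1)  # most likely character index per position
    return ''.join(self.indices_char[i] for i in X)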
def dist_abs(self, src, tar):
    """Return the Editex distance between two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    int
        Editex distance

    Examples
    --------
    >>> cmp = Editex()
    >>> cmp.dist_abs('cat', 'hat')
    2
    >>> cmp.dist_abs('Niall', 'Neil')
    2
    >>> cmp.dist_abs('aluminum', 'Catalan')
    12
    >>> cmp.dist_abs('ATCG', 'TAGC')
    6


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    match_cost, group_cost, mismatch_cost = self._cost

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition.

        Parameters
        ----------
        ch1 : str
            The first character to compare
        ch2 : str
            The second character to compare

        Returns
        -------
        int
            r(a,b) according to Zobel & Dart's definition

        .. versionadded:: 0.1.0

        """
        if ch1 == ch2:
            return match_cost
        if ch1 in self._all_letters and ch2 in self._all_letters:
            for group in self._letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition.

        Parameters
        ----------
        ch1 : str
            The first character to compare
        ch2 : str
            The second character to compare

        Returns
        -------
        int
            d(a,b) according to Zobel & Dart's definition

        .. versionadded:: 0.1.0

        """
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = unicode_normalize('NFKD', text_type(src.upper()))
    tar = unicode_normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    src_len = len(src)
    tar_len = len(tar)
    max_len = max(src_len, tar_len)

    if src == tar:
        return 0.0
    if not src:
        return sum(
            mismatch_cost * self._taper(pos, max_len)
            for pos in range(tar_len)
        )
    if not tar:
        return sum(
            mismatch_cost * self._taper(pos, max_len)
            for pos in range(src_len)
        )

    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
    src = ' ' + src
    tar = ' ' + tar

    if not self._local:
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
                src[i - 1], src[i]
            ) * self._taper(i, max_len)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + d_cost(
                tar[j - 1], tar[j]
            ) * self._taper(j, max_len)

    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            d_mat[i, j] = min(
                d_mat[i - 1, j]
                + d_cost(src[i - 1], src[i]) * self._taper(max(i, j), max_len),
                d_mat[i, j - 1]
                + d_cost(tar[j - 1], tar[j]) * self._taper(max(i, j), max_len),
                d_mat[i - 1, j - 1]
                + r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len),
            )

    if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
        return int(d_mat[src_len, tar_len])
    else:
        return d_mat[src_len, tar_len]
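# Hedged usage sketch for Editex.dist_abs above, taken from its doctests;
# assumes the enclosing Editex class with its default (match, group,
# mismatch) cost tuple.
cmp = Editex()
assert cmp.dist_abs('cat', 'hat') == 2    # 'c'/'h' share no letter group
assert cmp.dist_abs('Niall', 'Neil') == 2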
def blurMaps(self):
    """Blur the 2D image maps"""
    self.blurredMaps = np_zeros(
        (self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor)
    )
    for i in range(self.numImgMaps):  # top, front and side
        self.blurredMaps[i, :, :] = ndi.gaussian_filter(
            self.imageMaps[i, :, :], 8  # sigma hard-coded; was self.blurRadius
        )
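# Standalone sketch of the blur used above: scipy.ndimage.gaussian_filter
# smooths each 2D map with a Gaussian kernel (sigma is hard-coded to 8 in
# blurMaps).
import numpy as np
from scipy import ndimage as ndi

demo = np.zeros((64, 64))
demo[32, 32] = 1.0                      # a single impulse...
blurred = ndi.gaussian_filter(demo, 8)  # ...spread into a Gaussian bump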
def dist_abs(self, src, tar):
    """Return the Damerau-Levenshtein distance between two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    int (may return a float if cost has float values)
        The Damerau-Levenshtein distance between src & tar

    Raises
    ------
    ValueError
        Unsupported cost assignment; the cost of two transpositions must
        not be less than the cost of an insert plus a delete.

    Examples
    --------
    >>> cmp = DamerauLevenshtein()
    >>> cmp.dist_abs('cat', 'hat')
    1
    >>> cmp.dist_abs('Niall', 'Neil')
    3
    >>> cmp.dist_abs('aluminum', 'Catalan')
    7
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    ins_cost, del_cost, sub_cost, trans_cost = self._cost

    if src == tar:
        return 0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if 2 * trans_cost < ins_cost + del_cost:
        raise ValueError(
            'Unsupported cost assignment; the cost of two transpositions '
            + 'must not be less than the cost of an insert plus a delete.'
        )

    d_mat = np_zeros((len(src), len(tar)), dtype=np_int)

    if src[0] != tar[0]:
        d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

    src_index_by_character = {src[0]: 0}

    for i in range(1, len(src)):
        del_distance = d_mat[i - 1, 0] + del_cost
        ins_distance = (i + 1) * del_cost + ins_cost
        match_distance = i * del_cost + (
            0 if src[i] == tar[0] else sub_cost
        )
        d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

    for j in range(1, len(tar)):
        del_distance = (j + 1) * ins_cost + del_cost
        ins_distance = d_mat[0, j - 1] + ins_cost
        match_distance = j * ins_cost + (
            0 if src[0] == tar[j] else sub_cost
        )
        d_mat[0, j] = min(del_distance, ins_distance, match_distance)

    for i in range(1, len(src)):
        max_src_letter_match_index = 0 if src[i] == tar[0] else -1
        for j in range(1, len(tar)):
            candidate_swap_index = (
                -1
                if tar[j] not in src_index_by_character
                else src_index_by_character[tar[j]]
            )
            j_swap = max_src_letter_match_index
            del_distance = d_mat[i - 1, j] + del_cost
            ins_distance = d_mat[i, j - 1] + ins_cost
            match_distance = d_mat[i - 1, j - 1]
            if src[i] != tar[j]:
                match_distance += sub_cost
            else:
                max_src_letter_match_index = j

            if candidate_swap_index != -1 and j_swap != -1:
                i_swap = candidate_swap_index

                if i_swap == 0 and j_swap == 0:
                    pre_swap_cost = 0
                else:
                    pre_swap_cost = d_mat[
                        max(0, i_swap - 1), max(0, j_swap - 1)
                    ]
                swap_distance = (
                    pre_swap_cost
                    + (i - i_swap - 1) * del_cost
                    + (j - j_swap - 1) * ins_cost
                    + trans_cost
                )
            else:
                swap_distance = maxsize

            d_mat[i, j] = min(
                del_distance, ins_distance, match_distance, swap_distance
            )
        src_index_by_character[src[i]] = i

    return d_mat[len(src) - 1, len(tar) - 1]
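# Hedged usage sketch for DamerauLevenshtein.dist_abs above, from its
# doctests; assumes the enclosing class with its default unit costs.
cmp = DamerauLevenshtein()
assert cmp.dist_abs('ATCG', 'TAGC') == 2  # two adjacent transpositions
assert cmp.dist_abs('cat', 'hat') == 1    # one substitution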
def dist_abs(self, src, tar):
    """Return the Levenshtein distance between two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    int (may return a float if cost has float values)
        The Levenshtein distance between src & tar

    Examples
    --------
    >>> cmp = Levenshtein()
    >>> cmp.dist_abs('cat', 'hat')
    1
    >>> cmp.dist_abs('Niall', 'Neil')
    3
    >>> cmp.dist_abs('aluminum', 'Catalan')
    7
    >>> cmp.dist_abs('ATCG', 'TAGC')
    3

    >>> cmp = Levenshtein(mode='osa')
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2
    >>> cmp.dist_abs('ACTG', 'TAGC')
    4


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    ins_cost, del_cost, sub_cost, trans_cost = self._cost

    src_len = len(src)
    tar_len = len(tar)
    max_len = max(src_len, tar_len)

    if src == tar:
        return 0
    if not src:
        return sum(
            ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
        )
    if not tar:
        return sum(
            del_cost * self._taper(pos, max_len) for pos in range(src_len)
        )

    d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
    for i in range(src_len + 1):
        d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
    for j in range(tar_len + 1):
        d_mat[0, j] = j * self._taper(j, max_len) * ins_cost

    for i in range(src_len):
        for j in range(tar_len):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j]
                + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
                d_mat[i, j + 1]
                + del_cost * self._taper(1 + max(i, j), max_len),  # del
                d_mat[i, j]
                + (
                    sub_cost * self._taper(1 + max(i, j), max_len)
                    if src[i] != tar[j]
                    else 0
                ),  # sub/==
            )

            if self._mode == 'osa':
                if (
                    i + 1 > 1
                    and j + 1 > 1
                    and src[i] == tar[j - 1]
                    and src[i - 1] == tar[j]
                ):
                    # transposition
                    d_mat[i + 1, j + 1] = min(
                        d_mat[i + 1, j + 1],
                        d_mat[i - 1, j - 1]
                        + trans_cost * self._taper(1 + max(i, j), max_len),
                    )

    if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
        return int(d_mat[src_len, tar_len])
    else:
        return d_mat[src_len, tar_len]
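# Hedged sketch contrasting the two modes of dist_abs above, from its
# doctests: plain Levenshtein needs 3 edits for ATCG -> TAGC, while
# mode='osa' (optimal string alignment) also allows adjacent transpositions
# and needs only 2.
assert Levenshtein().dist_abs('ATCG', 'TAGC') == 3
assert Levenshtein(mode='osa').dist_abs('ATCG', 'TAGC') == 2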