Example #1
def vectorize(questions, answers, chars=None):
    """Vectorize the questions and expected answers"""
    print('Vectorization...')
    chars = chars or CHARS
    x_maxlen = max(len(question) for question in questions)
    y_maxlen = max(len(answer) for answer in answers)
    #     print (len(questions), x_maxlen, len(chars))
    len_of_questions = len(questions)
    ctable = CharacterTable(chars)
    print("X = np_zeros")
    X = np_zeros((len_of_questions, x_maxlen, len(chars)), dtype=bool)
    print("for i, sentence in enumerate(questions):")
    for i, sentence in enumerate(questions):
        for j, c in enumerate(sentence):
            X[i, j, ctable.char_indices[c]] = 1
    print("y = np_zeros")
    y = np_zeros((len_of_questions, y_maxlen, len(chars)), dtype=bool)
    print("for i, sentence in enumerate(answers):")
    for i, sentence in enumerate(answers):
        for j, c in enumerate(sentence):
            y[i, j, ctable.char_indices[c]] = 1

    # Explicitly set apart 10% for validation data that we never train over
    split_at = int(len(X) - len(X) / 10)
    (X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
    (y_train, y_val) = (y[:split_at], y[split_at:])

    print(X_train.shape)
    print(y_train.shape)

    return X_train, X_val, y_train, y_val, y_maxlen, ctable
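Note: the CharacterTable helper is assumed, not shown; only its char_indices mapping is used above. A minimal sketch of what it presumably provides, modeled on the classic Keras addition-RNN example:

class CharacterTable(object):
    """Minimal sketch (assumption): maps each character to an index and back."""

    def __init__(self, chars):
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = {i: c for i, c in enumerate(self.chars)}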
Example #2
 def get_fit(self,
             fit_type,
             data,
             order,
             smooth,
             degree,
             begin,
             end,
             weight=None):
     z1 = np_zeros(begin)
     z2 = np_zeros(len(data[0]) - end)
     if end == 0:
         end = None
     x = data[0][begin:end]
     y = data[1][begin:end]
     if weight is not None:
         weight = weight[begin:end]
     if fit_type == "spline":
         f = inter.UnivariateSpline(x, y, w=weight, k=order, s=smooth)
     else:
         f = poly.Chebyshev.fit(x, y, degree, w=weight)
     res = y - f(x)
     nfit = f(x) / np_max(f(x))
     corr = concatenate((z1, nfit, z2))
     fitc = [x, f(x), corr, res]
     return fitc
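Note: a runnable sketch of what the two fit branches above compute, assuming inter and poly alias scipy.interpolate and numpy.polynomial as the call sites suggest:

import numpy as np
from numpy import polynomial as poly
from scipy import interpolate as inter

x = np.linspace(0.0, 10.0, 200)
y = np.sin(x) + 0.05 * np.random.randn(200)

f_spline = inter.UnivariateSpline(x, y, w=None, k=3, s=1.0)  # "spline" branch
f_cheb = poly.Chebyshev.fit(x, y, 9, w=None)                 # Chebyshev branch
res = y - f_spline(x)                                        # residuals, as in get_fit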
Example #3
    def pca(self, data_matrix):
        """Perform PCA.

        Returns the principal components along with
        the variance of each component.

        Parameters
        ----------
        data_matrix : list of lists
          List of tetranucleotide signatures
        """

        cols = len(data_matrix[0])
        data_matrix = np_reshape(np_array(data_matrix), (len(data_matrix), cols))

        pca = PCA()
        pc, variance = pca.pca_matrix(data_matrix, 3, bCenter=True, bScale=False)

        # ensure pc matrix has at least 3 dimensions
        if pc.shape[1] == 1:
            pc = np_append(pc, np_zeros((pc.shape[0], 2)), 1)
            variance = np_append(variance[0], np_ones(2))
        elif pc.shape[1] == 2:
            pc = np_append(pc, np_zeros((pc.shape[0], 1)), 1)
            variance = np_append(variance[0:2], np_ones(1))

        return pc, variance
Example #5
def non_iter_ls_inv_stft(stft_object):
    stft_data = stft_object['stft']
    origSigSize = stft_object['origSigSize']
    num_rows, _, _ = origSigSize
    shift_length = stft_object['shift_length']
    len_each_section, num_rows_overlap, _, _ = stft_data.shape
    # TODO: Isn't this just num_rows in the very beginning?
    # total_new_elements = (num_rows_overlap - 1) * shift_length + len_each_section
    win_info = stft_object['win_info']
    wVec = win_info(len_each_section)
    wVecSq = wVec**2
    vecC = np_arange(1, num_rows_overlap * shift_length, step=shift_length)
    # vecC = range(0, num_rows_overlap*shift_length-1, shift_length)
    DlsArr = np_zeros((num_rows, ))
    for j in vecC:
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        # tmpArr = np_arange(j, len_each_section+j)
        DlsArr[tmpArr] += wVecSq
    # DlsArrInv = 1/DlsArr
    invFT = math_sqrt(len_each_section) * np_ifft(stft_data, axis=0)
    invFT_real = invFT.real
    invFT *= wVec[:, np_newaxis, np_newaxis, np_newaxis]
    yEst = np_zeros(origSigSize)
    for index, j in enumerate(vecC):
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        yEst[tmpArr, :] += invFT_real[:, index, :]
    # sigOut = yEst * DlsArrInv[:, np_newaxis, np_newaxis]
    sigOut = yEst / DlsArr[:, np_newaxis, np_newaxis]
    return sigOut
Example #7
    def wlcs(self, src, tar):
        """Return the Rouge-W weighted longest common sub-sequence length.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The Rouge-W weighted longest common sub-sequence length between
            src & tar

        Examples
        --------
        >>> cmp = RougeW()
        >>> cmp.wlcs('cat', 'hat')
        4
        >>> cmp.wlcs('Niall', 'Neil')
        3
        >>> cmp.wlcs('aluminum', 'Catalan')
        5
        >>> cmp.wlcs('ATCG', 'TAGC')
        3

        .. versionadded:: 0.4.0

        """
        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return self._f_func(len(src))
        if not src:
            return 0
        if not tar:
            return 0

        c_mat = np_zeros((src_len, tar_len), dtype=np_int)
        w_mat = np_zeros((src_len, tar_len), dtype=np_int)

        for i in range(src_len):
            for j in range(tar_len):
                if src[i] == tar[j]:
                    k = w_mat[i - 1, j - 1]
                    c_mat[i, j] = (c_mat[i - 1, j - 1] + self._f_func(k + 1) -
                                   self._f_func(k))
                    w_mat[i, j] = k + 1
                else:
                    if c_mat[i - 1, j] > c_mat[i, j - 1]:
                        c_mat[i, j] = c_mat[i - 1, j]
                        w_mat[i, j] = 0
                    else:
                        c_mat[i, j] = c_mat[i, j - 1]
                        w_mat[i, j] = 0

        return c_mat[src_len - 1, tar_len - 1]
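Note: self._f_func is not shown; the doctests above are consistent with the standard Rouge-W weighting f(k) = k**2 (e.g. wlcs('cat', 'hat') scores the contiguous match 'at' as f(2) = 4), so a plausible sketch is:

    def _f_func(self, length):
        # Assumed Rouge-W weighting: a contiguous run of k matches scores k**2.
        return length ** 2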
Example #8
def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
    """Return the Gotoh score of two strings.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param float gap_open: the cost of an open alignment gap (1 by default)
    :param float gap_ext: the cost of an alignment gap extension (0.4 by
        default)
    :param function sim_func: a function that returns the similarity of two
        characters (identity similarity by default)
    :returns: Gotoh score
    :rtype: float

    >>> gotoh('cat', 'hat')
    2.0
    >>> gotoh('Niall', 'Neil')
    1.0
    >>> round(gotoh('aluminum', 'Catalan'), 12)
    -0.4
    >>> gotoh('cat', 'hat')
    2.0
    """
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)

    d_mat[0, 0] = 0
    p_mat[0, 0] = float('-inf')
    q_mat[0, 0] = float('-inf')
    for i in range(1, len(src) + 1):
        d_mat[i, 0] = float('-inf')
        p_mat[i, 0] = -gap_open - gap_ext * (i - 1)
        q_mat[i, 0] = float('-inf')
        q_mat[i, 1] = -gap_open
    for j in range(1, len(tar) + 1):
        d_mat[0, j] = float('-inf')
        p_mat[0, j] = float('-inf')
        p_mat[1, j] = -gap_open
        q_mat[0, j] = -gap_open - gap_ext * (j - 1)

    for i in range(1, len(src) + 1):
        for j in range(1, len(tar) + 1):
            sim_val = sim_func(src[i - 1], tar[j - 1])
            d_mat[i, j] = max(d_mat[i - 1, j - 1] + sim_val,
                              p_mat[i - 1, j - 1] + sim_val,
                              q_mat[i - 1, j - 1] + sim_val)

            p_mat[i, j] = max(d_mat[i - 1, j] - gap_open,
                              p_mat[i - 1, j] - gap_ext)

            q_mat[i, j] = max(d_mat[i, j - 1] - gap_open,
                              q_mat[i, j - 1] - gap_ext)

    i, j = (n - 1 for n in d_mat.shape)
    return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
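Note: sim_ident is not defined in this snippet; to be consistent with the doctests (e.g. gotoh('cat', 'hat') scores the two matching characters as 2.0), it presumably returns 1.0 for identical characters and 0.0 otherwise:

def sim_ident(src_char, tar_char):
    # Identity similarity (assumed): 1.0 on a match, 0.0 otherwise.
    return 1.0 if src_char == tar_char else 0.0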
Example #9
 def __init_statistics(self):
     stats = self.raw_stats
     if stats is not None:
         combined = np_array([[int(team), stats['oprs'][team], stats['dprs'][team],
                               stats['ccwms'][team]] for team in stats['oprs'].keys()], np_object)
     else:
         teams = self.get_team()[:, 0]
         num_teams = len(teams)
         combined = np_rot90(
             np_array([teams, np_zeros(num_teams), np_zeros(num_teams), np_zeros(num_teams)], np_object))[::-1]
     self.stats = combined
Example #10
    def __init_matches(self):

        for match_type, var in [['qm', 'qualification_matches'], ['qf', 'quarter_final_matches'],
                                ['sf', 'semi_final_matches'], ['f', 'final_matches']]:
            num_matches = self.__count_matches(self.raw_matches, match_type)
            if num_matches != 0:
                # zero = range(num_matches)
                red_teams = np_zeros((num_matches,), np_object)
                blue_teams = np_zeros((num_matches,), np_object)
                blue_scores = np_zeros((num_matches,), np_object)
                red_scores = np_zeros((num_matches,), np_object)
                match_code = np_zeros((num_matches,), np_object)
                match_numbers = np_arange(1, num_matches + 1, 1)

                for match in self.raw_matches:
                    if match['comp_level'] == match_type:
                        match_num = match['match_number'] - 1

                        red_teams[match_num] = [np_int(match['alliances']['red']['teams'][0][3:]),
                                                np_int(match['alliances']['red']['teams'][1][3:]),
                                                np_int(match['alliances']['red']['teams'][2][3:])]

                        red_scores[match_num] = [-1 if match['alliances']['red']['score'] is None
                                                 else match['alliances']['red']['score'],
                                                 -1 if match['score_breakdown']['red']['auto'] is None
                                                 else match['score_breakdown']['red']['auto'],
                                                 -1 if match['score_breakdown']['red']['foul'] is None
                                                 else match['score_breakdown']['red']['foul']]

                        blue_teams[match_num] = [np_int(match['alliances']['blue']['teams'][0][3:]),
                                                 np_int(match['alliances']['blue']['teams'][1][3:]),
                                                 np_int(match['alliances']['blue']['teams'][2][3:])]

                        blue_scores[match_num] = [-1 if match['alliances']['blue']['score'] is None
                                                  else match['alliances']['blue']['score'],
                                                  -1 if match['score_breakdown']['blue']['auto'] is None
                                                  else match['score_breakdown']['blue']['auto'],
                                                  -1 if match['score_breakdown']['blue']['foul'] is None
                                                  else match['score_breakdown']['blue']['foul']]
                        match_code[match_num] = match['key']

                red_win = np_array(red_scores.tolist())[:, 0] > np_array(blue_scores.tolist())[:, 0]
                winner = np_array(['blue'] * len(red_win))
                winner[red_win] = 'red'

                self.__setattr__(var,
                                 np_rot90(np_array([[match_type] * num_matches, match_numbers, red_teams, blue_teams,
                                                    red_scores, blue_scores, winner, match_code], np_object))[::-1])
Example #11
def updateScore(csvfile, score):
    """ Add or update score column and reorder """
    import string
    head, rows = read_csv(csvfile)
    data = pd_read_csv(csvfile)
    data.index = data.index + 1
    cols = data.columns.tolist()
    sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index)
    if 'Score' not in cols:
        data['Score'] = sco
        cols = ['Score'] + cols
        data = data[cols]
    colk = list(string.ascii_uppercase)
    for sc in score:
        try:
            coln = colk.index(sc[0])
            val = sc[2]
            checked = sc[3]
            if checked:
                sco += val * data.iloc[:, coln]
        except:
            continue
    data['Score'] = sco
    data = data.sort_values('Score', ascending=False)
    updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']])
    data = data.reset_index(drop=True)
    data.index = data.index + 1
    data = data.rename_axis('Select', axis="columns")
    data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False)
    return data
Example #12
def parse_matrix_part(matrix, szSub, ovSub):
    assert matrix.ndim == 3
    assert np_ndim(szSub) == 1
    assert len(szSub) == 3
    assert np_ndim(ovSub) == 1
    assert len(ovSub) == 3

    matrix_shape = np_asarray(matrix.shape, dtype=int)
    len_each_section, _, _ = szSub
    shift_length, _, _ = ovSub

    len_each_section_range = np_arange(len_each_section)

    matrix_shape = np_ceil((matrix_shape - szSub + 1)/ovSub).astype(int)
    num_rows_overlap, num_elements, num_beams = matrix_shape
    result_matrix = np_zeros((np_prod(szSub), np_prod(matrix_shape)))
    cnt = 0
    for i in range(num_beams):
        for j in range(num_elements):
            for k in range(num_rows_overlap):
                index_1 = len_each_section_range + k * shift_length
                index_2 = j
                index_3 = i
                tmp = matrix[index_1, index_2, index_3]
                result_matrix[:, cnt] = tmp
                cnt += 1

    return result_matrix
Example #13
def GHZ_state(n):
    r"""生成一个 GHZ-state 的 numpy 形式。

    Args:
        n (int): 量子比特数量

    Returns:
        numpy.ndarray: 一个形状为 ``(1, 2**n)`` 的 numpy 数组

    代码示例:

    .. code-block:: python

        from paddle_quantum.state import GHZ_state
        vector = GHZ_state(3)
        print(vector)

    ::

        [[0.70710678+0.j 0.        +0.j 0.        +0.j 0.        +0.j
          0.        +0.j 0.        +0.j 0.        +0.j 0.70710678+0.j]]
    """
    assert n > 2, 'qubit number must be larger than 2'
    state = np_zeros((1, 2**n))
    state[0][0] = 1 / np.sqrt(2)
    state[0][-1] = 1 / np.sqrt(2)

    return state.astype("complex128")
Example #14
        def _lcsstr_stl(src, tar):
            """Return start positions & length for Ratcliff-Obershelp.

            Parameters
            ----------
            src : str
                Source string for comparison
            tar : str
                Target string for comparison

            Returns
            -------
            tuple
                The start position in the source string, start position in the
                target string, and length of the longest common substring of
                strings src and tar.

            .. versionadded:: 0.1.0

            """
            lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
            longest, src_longest, tar_longest = 0, 0, 0
            for i in range(1, len(src) + 1):
                for j in range(1, len(tar) + 1):
                    if src[i - 1] == tar[j - 1]:
                        lengths[i, j] = lengths[i - 1, j - 1] + 1
                        if lengths[i, j] > longest:
                            longest = lengths[i, j]
                            src_longest = i
                            tar_longest = j
                    else:
                        lengths[i, j] = 0
            return src_longest - longest, tar_longest - longest, longest
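Note: the returned triple can be unpacked to recover the common substring itself; a small usage sketch (values follow from the algorithm above):

src, tar = 'aluminum', 'Catalan'
src_start, tar_start, length = _lcsstr_stl(src, tar)
print(src[src_start:src_start + length])  # -> 'al'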
Example #15
 def get_features(self, student_ID, semester):
     """
     """
     abs_df = self._academic_clusterer.courses_features
     tmp_df = abs_df[ abs_df[self._academic_clusterer.course_attr].isin(semester) ]
     tse_df = self._academic_clusterer.semesters_features
     tse_df = tse_df[ tse_df[self._academic_clusterer.studentId_attr]==student_ID ]
     if tse_df.empty:
         semester_lvl = 1
     else:
         semester_lvl = tse_df[self._academic_clusterer.SEMESTERS_F_LABELS[0]].values.max() + 1
     alpha = tmp_df['alpha'].values.sum()
     beta = tmp_df['beta'].values.sum()
     skewness = tmp_df['skewness'].values.sum()
     n_courses = len( semester )
     
     semester_features = (semester_lvl, alpha, beta, skewness, n_courses)
     print(semester_features)
     cs_df = self._academic_clusterer.students_features
     cs_df = cs_df[ cs_df[self._academic_clusterer.studentId_attr] == student_ID ]
     if cs_df.empty:
         student_features = np_zeros((1,5))
     else:        
         student_features = cs_df[ self._academic_clusterer.STUDENTS_F_LABELS ].values
             
     return semester_features, student_features
Example #16
    def noise_dwt(cls, coeff, w):
        """Return the estimation of the DWT components noise level

        coeff: DWT coefficients
        w: pywt wavelet object
        """
        n_boot = 1000
        k_th = 10
        k_std = 1. / np_sqrt(2)
        std_l = []
        std_a = np_zeros(n_boot)
        wcomp = cls.wavecomp(coeff, w, len(coeff) - 1)

        for ii in range(n_boot):
            std_a[ii] = np_std(bootstrap_resample(wcomp, 10))

        stdv = np_median(std_a)
        std_l.append(stdv)
        for ll in range(len(coeff) - 2, 0, -1):
            stdv = stdv * k_std
            std_l.append(stdv)
        std_l.append(0)

        std_l.reverse()
        return np_array(std_l) * k_th
Example #18
def w_state(n, coeff=None):
    r"""生成一个 W-state 的 numpy 形式。

    Args:
        n (int): 量子比特数量
        coeff (numpy.ndarray, optional): 默认为 ``None`` ,即生成平均概率幅(系数)

    Returns:
        numpy.ndarray: 一个形状为 ``(1, 2**n)`` 的 numpy 数组

    代码示例:

    .. code-block:: python
    
        from paddle_quantum.state import w_state
        vector = w_state(3)
        print(vector)

    ::
    
        [[0.        +0.j 0.57735027+0.j 0.57735027+0.j 0.        +0.j
        0.57735027+0.j 0.        +0.j 0.        +0.j 0.        +0.j]]
    """
    assert n > 0, 'qubit number must be positive'

    c = coeff if coeff is not None else np.ones((1, 2**n)) / np.sqrt(n)
    assert c.shape[0] == 1 and c.shape[
        1] == 2**n, 'The dimension of coeff is not right'

    state = np_zeros((1, 2**n))
    for i in range(n):
        state[0][2**i] = c[0][n - i - 1]

    return state.astype("complex128")
Example #19
def density_op(n):
    r"""生成密度矩阵 :math:`|00..0\rangle \langle00..0|` 的 numpy 形式。

    Args:
        n (int): 量子比特数量

    Returns:
        numpy.ndarray: 一个形状为 ``(2**n, 2**n)`` 的 numpy 数组

    代码示例:

    .. code-block:: python
    
        from paddle_quantum.state import density_op
        state = density_op(2)
        print(state)

    ::

        [[1.+0.j 0.+0.j 0.+0.j 0.+0.j]
        [0.+0.j 0.+0.j 0.+0.j 0.+0.j]
        [0.+0.j 0.+0.j 0.+0.j 0.+0.j]
        [0.+0.j 0.+0.j 0.+0.j 0.+0.j]]

    """
    assert n > 0, 'qubit number must be positive'
    rho = np_zeros((2**n, 2**n))
    rho[0, 0] = 1

    return rho.astype("complex128")
Example #20
    def get_captions(self, ix, necessary_num_img_captions):
        #
        # Fetch the sequence labels
        # NOTE: 1-indexed, not 0-indexed
        first_caption_idx = self.label_start_ix[
            ix] - 1  # label_start_ix starts from 1
        last_caption_idx = self.label_end_ix[ix] - 1
        num_img_captions = last_caption_idx - first_caption_idx + 1
        assert num_img_captions > 0, f"Image {ix} has no caption. Aborting!"

        #
        # If we require more captions per image for training
        # than are available, we sample with replacement.
        if num_img_captions < necessary_num_img_captions:
            #
            seq = np_zeros([necessary_num_img_captions, self.max_seq_length],
                           dtype="int")
            for q in range(necessary_num_img_captions):
                ixl = randint(first_caption_idx, last_caption_idx)
                seq[q, :] = self.label[ixl, :self.max_seq_length]
        else:
            #
            # Unnecessary to choose the captions sequentially. Come back to this later...
            ixl = randint(first_caption_idx,
                          last_caption_idx - necessary_num_img_captions + 1)
            seq = self.label[ixl:ixl +
                             necessary_num_img_captions, :self.max_seq_length]

        return seq
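Note: randint here must behave like Python's random.randint, whose upper bound is inclusive; with numpy.random.randint (exclusive upper bound) the last caption of an image could never be sampled in the with-replacement branch.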
Example #21
    def index_data(self, new_sequences: np_ndarray):
        """
        The Index_Data function allows you to insert a large number of sequences

        :param numpy.ndarray new_sequences: The sequences to be inserted

        :returns: The number of sequences (sub-sequences) inserted into the tree (or trees)
        :rtype: numpy.array
        """

        # Ts Conversion to PAA
        if new_sequences.shape[-1] > 1:
            # add dim to avoid tslearn warning
            new_sequences = new_sequences.reshape(new_sequences.shape + (1, ))
        npaa = self._paa.fit_transform(new_sequences)

        # To count the number of objects in each tree
        cmpt_insert = np_zeros(shape=self.number_tree)

        for i, tree in self.forest.items():
            # Retrieves the indices of the tree, in the multi-tree case
            npaa_tmp = npaa[:, self.indices_partition[i]]
            npaa_tmp = npaa_tmp.reshape(npaa_tmp.shape[:-1])

            for npa_tp in npaa_tmp:
                tree.insert_paa(npa_tp)
                cmpt_insert[i] += 1

        # Returns array[tree_index] with the number of inserted objects for each tree
        return cmpt_insert
Example #22
    def number_nodes_visited(self, sub_query: np_array, ntss_tmp: np_ndarray):
        """
        Count the average number of nodes visited in the tree when computing the approximation.

        :param numpy.array sub_query: The sequence to be evaluated
        :param numpy.ndarray ntss_tmp: Reference sequences

        :returns: Returns the number of nodes visited in the tree for the *i*\ CFOF approximation
        :rtype: numpy.array
        """
        q_paa = self.isax.transform_paa([sub_query])[0]
        ntss_tmp_paa = self.isax.transform_paa(ntss_tmp)

        distance_q_p = cdist([q_paa.reshape(q_paa.shape[:-1])],
                             ntss_tmp_paa.reshape(ntss_tmp_paa.shape[:-1]))[0]

        list_parent_node = np_zeros(len(self.node_list), dtype=np_uint32)

        for tmp_node in self.node_list:
            if tmp_node.id_numpy == 0:
                continue
            list_parent_node[tmp_node.id_numpy] = tmp_node.parent.id_numpy

        count_visited_nodes_list = nodes_visited_for_all_seq_ref(
            len(ntss_tmp_paa), distance_q_p, self.max_array, self.min_array,
            list_parent_node)

        return self.num_nodes, count_visited_nodes_list.mean()
Example #23
    def populateImageMaps(self):
        """Load the transformed data into the main image maps"""
        # reset these guys... JIC
        self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
        self.im2RowIndicies = {}

        # add to the grid wherever we find a contig
        row_index = -1
        for point in np_around(self.PM.transformedCP):
            row_index += 1
            # can only bin things once!
            if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                # add to the row_index dict so we can relate the
                # map back to individual points later
                p = tuple(point)
                if p in self.im2RowIndicies:
                    self.im2RowIndicies[p].append(row_index)
                else:
                    self.im2RowIndicies[p] = [row_index]

                # now increment in the grid
                # for each point we encounter we increment
                # its position + the positions to each side
                # and touching each corner
                self.incrementViaRowIndex(row_index, p)
Example #24
    def dist_abs(self, src, tar):
        """Return the FlexMetric distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            FlexMetric distance

        Examples
        --------
        >>> cmp = FlexMetric()
        >>> cmp.dist_abs('cat', 'hat')
        0.8
        >>> cmp.dist_abs('Niall', 'Neil')
        1.5
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.1999999999999997


        .. versionadded:: 0.4.0

        """
        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 0
        if not src:
            return sum(self._cost('', -1, tar, j) for j in range(len(tar)))
        if not tar:
            return sum(self._cost(src, i, '', -1) for i in range(len(src)))

        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1)

        src_lc = src.lower()
        tar_lc = tar.lower()

        for i in range(src_len):
            for j in range(tar_len):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + self._cost('', -1, tar_lc, j),  # ins
                    d_mat[i, j + 1] + self._cost(src_lc, i, '', -1),  # del
                    d_mat[i, j] + (self._cost(src_lc, i, tar_lc, j)
                                   if src[i] != tar[j] else 0),  # sub/==
                )

        return d_mat[src_len, tar_len]
Example #25
def optimize_one_day(
    i: int,
    op_ct_func: Callable,
    pw_mat: np_ndarray,
    o_cub: np_ndarray,
    rsk_tgt_mat: np_ndarray,
    rsk_msk: np_ndarray,
    hg_tools: Tuple = (-1, )) -> np_ndarray:

    port_mat = o_cub[:, i, :]
    pw_ar = pw_mat[:, i, :]
    rsk_tgt = rsk_tgt_mat[i, :]

    hg_tools = list(hg_tools)
    rsk_msk = list(rsk_msk)
    pw_ar = pw_ar.reshape(-1)

    def _op_tgt(w: np_ndarray) -> float:
        tw = pw_ar.copy()
        tw[hg_tools] = w
        return op_ct_func(port_mat.T @ tw, rsk_tgt=rsk_tgt, rsk_msk=rsk_msk)

    w_0 = np_rd.randn(len(hg_tools))

    tr = np_zeros(port_mat.shape[0], dtype=float)
    tr[hg_tools] = minimize(_op_tgt, w_0, method="nelder-mead").x
    return tr
Example #26
def node_comparison_prec_recall(known_complex_nodes_list, fin_list_graphs,
                                N_pred_comp, N_test_comp, p, out_comp_nm):
    N_matches_test = 0

    Metric = np_zeros((N_test_comp, N_pred_comp))

    for i, test_complex in enumerate(known_complex_nodes_list):
        N_match_pred = 0
        for j, pred_complex in enumerate(fin_list_graphs):
            T = set(test_complex)
            P = pred_complex[0]
            C = len(T.intersection(P))
            A = len(P.difference(T))
            B = len(T.difference(P))

            if float(C) / (A + C) > p and float(C) / (B + C) > p:
                Metric[i, j] = 1
                N_match_pred = N_match_pred + 1

        if N_match_pred > 0:
            N_matches_test = N_matches_test + 1

    plot_pr_curve_orig(Metric, fin_list_graphs, out_comp_nm)

    Recall = float(N_matches_test) / N_test_comp

    N_matches_pred = np_count_nonzero(np_sum(Metric, axis=0))
    Precision = float(N_matches_pred) / N_pred_comp

    if Precision == Recall == 0:
        F1_score = 0
    else:
        F1_score = 2 * Precision * Recall / (Precision + Recall)

    return Precision, Recall, F1_score
Example #27
def score_by_listvrang(k_list_result, k_rho):
    """
    Computes CFOF score approximations from the vrang list, one approximation
    for each value of :math:`\\varrho` contained in the ``k_rho`` list.

    :param list(float) k_list_result: The list of vrang of the sequence to be evaluated
    :param list(float) k_rho: The list of :math:`\\varrho` values for the CFOF score approximations

    :returns: The list of CFOF score approximations
    :rtype: list(float)
    """

    nb_obj_total = len(k_list_result)

    score_list = np_zeros(len(k_rho))
    need_nn_prec = 0
    for k_rho_ite, k_rho_var in enumerate(k_rho):

        need_nn = k_rho_var - need_nn_prec

        while need_nn > 0:
            need_nn -= 1
            estim_final = k_list_result.pop(0)

        score_list[k_rho_ite] = estim_final / nb_obj_total
        need_nn_prec = k_rho_var

    return score_list
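A small worked call with illustrative vrang values: with four reference objects and :math:`\varrho` thresholds 1 and 3, the first score consumes one vrang (3/4) and the second consumes two more, ending at 9/4:

print(score_by_listvrang([3, 5, 9, 12], [1, 3]))  # -> [0.75 2.25]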
Example #28
    def _plot(self, x, y=None):

        cr = True
        if y is None:
            cr = False
            y = x

        length = x.shape[0]
        rplot = np_zeros((length, length))

        if cr:
            np_fill_diagonal(rplot, self.norm.compute(x, y))

        if self.norm.is_simmetric:
            for lag in range(1, length):
                d = self.norm.compute(x[0:-lag], y[lag:])
                np_fill_diagonal(rplot[lag:, 0:-lag], d)
                np_fill_diagonal(rplot[0:-lag, lag:], d)

#            rplot = np_rot90(rplot)

        else:
            pass

        return rplot
Example #29
    def set_data(self):
        self.x_mat = np_zeros((self.col_mat.shape[1], self.col_mat.shape[0]))

        self.x_mat[:, :] = self.x
        self.tex = pg_makeRGBA(np_rot90(self.col_mat),
                               levels=(50., 255.))[0] / 255.
        self.tex[..., 3] = self.tex[..., 0]
Example #30
def vrang_list_for_all_seq_ref(len_seq_list, distance, max_array, min_array,
                               cdf_mean, cdf_std, num_ts_by_node,
                               index_cdf_bin, cdf_bins):
    """
    Uses the function :func:`~pyCFOFiSAX.tree_iSAX.vrang_seq_ref` for each reference sequence.

    :param float len_seq_list: The number of reference sequences
    :param np_array distance: The distance between the two sequences
    :param np_ndarray max_array: Max distances between the nodes of the tree and the reference sequence
    :param np_ndarray min_array: Min distances between the nodes of the tree and the reference sequence
    :param np_ndarray cdf_mean: The average distances between the nodes of the tree and the reference sequence
    :param np_array cdf_std: Dispersion of distances in each leaf node
    :param np_array num_ts_by_node: The number of sequences in each leaf node
    :param np_array index_cdf_bin: The index of the CDF ``cdf_bins``
    :param np_array cdf_bins: CDF values of the standard normal distribution

    :returns: The list of vrang
    :rtype: np_array
    """
    vrang_array = np_zeros(len_seq_list)
    for ii_tmp in prange(len_seq_list):
        vrang_array[ii_tmp] = vrang_seq_ref(
            distance[ii_tmp], max_array[ii_tmp], min_array[ii_tmp],
            cdf_mean[ii_tmp], cdf_std, num_ts_by_node, index_cdf_bin, cdf_bins)
    return vrang_array
Example #31
    def lcsseq(self, src, tar):
        """Return the longest common subsequence of two strings.

        Based on the dynamic programming algorithm from
        http://rosettacode.org/wiki/Longest_common_subsequence
        :cite:`rosettacode:2018b`. This is licensed GFDL 1.2.

        Modifications include:
            conversion to a numpy array in place of a list of lists

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        str
            The longest common subsequence

        Examples
        --------
        >>> sseq = LCSseq()
        >>> sseq.lcsseq('cat', 'hat')
        'at'
        >>> sseq.lcsseq('Niall', 'Neil')
        'Nil'
        >>> sseq.lcsseq('aluminum', 'Catalan')
        'aln'
        >>> sseq.lcsseq('ATCG', 'TAGC')
        'AC'

        """
        lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)

        # row 0 and column 0 are initialized to 0 already
        for i, src_char in enumerate(src):
            for j, tar_char in enumerate(tar):
                if src_char == tar_char:
                    lengths[i + 1, j + 1] = lengths[i, j] + 1
                else:
                    lengths[i + 1, j + 1] = max(lengths[i + 1, j],
                                                lengths[i, j + 1])

        # read the substring out from the matrix
        result = ''
        i, j = len(src), len(tar)
        while i != 0 and j != 0:
            if lengths[i, j] == lengths[i - 1, j]:
                i -= 1
            elif lengths[i, j] == lengths[i, j - 1]:
                j -= 1
            else:
                result = src[i - 1] + result
                i -= 1
                j -= 1
        return result
Example #32
    def sim(self, src, tar):
        """Return the BI-SIM similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            BI-SIM similarity

        Examples
        --------
        >>> cmp = BISIM()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.4
        >>> cmp.sim('aluminum', 'Catalan')
        0.3125
        >>> cmp.sim('ATCG', 'TAGC')
        0.375


        .. versionadded:: 0.4.0

        """
        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        def _id(src_pos, tar_pos):
            s = 0
            for i in range(self._qval):
                s += int(src[src_pos + i] == tar[tar_pos + i])
            return s / self._qval

        src = src[0].swapcase() * (self._qval - 1) + src
        tar = tar[0].swapcase() * (self._qval - 1) + tar

        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)

        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + _id(i - 1, j - 1),  # sub/==
                    d_mat[i - 1, j],  # ins
                    d_mat[i, j - 1],  # del
                )
        return d_mat[src_len, tar_len] / max(src_len, tar_len)
Example #34
 def __getitem__(self, index):
     """This function returns a tuple that is further passed to collate_fn
     """
     ix, it_pos_now, wrapped = index  # self.split_ix[index]
     if self.use_att:
         att_feat = self.att_loader.get(str(self.info["images"][ix]["id"]))
         # Reshape to K x C
         att_feat = att_feat.reshape(-1, att_feat.shape[-1])
         if self.norm_att_feat:
             att_feat = att_feat / np_norm(att_feat, 2, 1, keepdims=True)
         if self.use_box:
             box_feat = self.box_loader.get(
                 str(self.info["images"][ix]["id"]))
             # divided by image width and height
             x1, y1, x2, y2 = np_hsplit(box_feat, 4)
             h, w = (
                 self.info["images"][ix]["height"],
                 self.info["images"][ix]["width"],
             )
             box_feat = np_hstack(
                 (x1 / w, y1 / h, x2 / w, y2 / h,
                  (x2 - x1) * (y2 - y1) / (w * h)))  # question? x2-x1+1??
             if self.norm_box_feat:
                 box_feat = box_feat / np_norm(
                     box_feat, 2, 1, keepdims=True)
             att_feat = np_hstack([att_feat, box_feat])
             # sort the features by the size of boxes
             att_feat = np_stack(
                 sorted(att_feat, key=lambda x: x[-1], reverse=True))
     else:
         att_feat = np_zeros((0, 0), dtype="float32")
     if self.use_fc:
         try:
             fc_feat = self.fc_loader.get(str(
                 self.info["images"][ix]["id"]))
         except:
             # Use average of attention when there is no fc provided (For bottomup feature)
             fc_feat = att_feat.mean(0)
     else:
         fc_feat = np_zeros((0), dtype="float32")
     if hasattr(self, "h5_label_file"):
         seq = self.get_captions(ix, self.necessary_num_img_captions)
     else:
         seq = None
     return (fc_feat, att_feat, seq, ix, it_pos_now, wrapped)
Example #35
    def vector(self, sentence):
        v = np_zeros(len(self.vocab), dtype=int)

        for word in sentence.split(' '):
            for i, _word in enumerate(self.vocab):
                if _word == word:
                    # print(_word)
                    v[i] = 1
        return v
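A standalone restating of the method with self.vocab passed in explicitly, to show the binary bag-of-words encoding it produces:

from numpy import zeros as np_zeros

def bow_vector(vocab, sentence):
    # Same logic as the method above, with vocab as an explicit argument.
    v = np_zeros(len(vocab), dtype=int)
    for word in sentence.split(' '):
        for i, _word in enumerate(vocab):
            if _word == word:
                v[i] = 1
    return v

print(bow_vector(['the', 'cat', 'sat'], 'the cat the'))  # -> [1 1 0]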
Example #36
 def _vectorize_text(self, text):
     vectors = []
     for word in text.split():
         # if there's no word2vec vector for this word, put in a vec of all 0
         try:
             vectors.append(self.word_vectors.word_vec(word))
         except:
             vectors.append(np_zeros(self.word_vec_size))
     return vectors
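Note: word_vec is the lookup method of a gensim 3.x KeyedVectors instance (renamed get_vector in gensim 4), so self.word_vectors here is presumably such an object with dimensionality self.word_vec_size.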
Example #38
    def lcsstr(self, src, tar):
        """Return the longest common substring of two strings.

        Longest common substring (LCSstr).

        Based on the code from
        https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        :cite:`Wikibooks:2018`.
        This is licensed Creative Commons: Attribution-ShareAlike 3.0.

        Modifications include:

            - conversion to a numpy array in place of a list of lists
            - conversion to Python 2/3-safe range from xrange via six

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        str
            The longest common substring

        Examples
        --------
        >>> sstr = LCSstr()
        >>> sstr.lcsstr('cat', 'hat')
        'at'
        >>> sstr.lcsstr('Niall', 'Neil')
        'N'
        >>> sstr.lcsstr('aluminum', 'Catalan')
        'al'
        >>> sstr.lcsstr('ATCG', 'TAGC')
        'A'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
        longest, i_longest = 0, 0
        for i in range(1, len(src) + 1):
            for j in range(1, len(tar) + 1):
                if src[i - 1] == tar[j - 1]:
                    lengths[i, j] = lengths[i - 1, j - 1] + 1
                    if lengths[i, j] > longest:
                        longest = lengths[i, j]
                        i_longest = i
                else:
                    lengths[i, j] = 0
        return src[i_longest - longest : i_longest]
Example #39
    def _coding_mask(self, seq_id):
        """Build mask indicating which bases in a sequences are coding."""

        # safe way to calculate coding bases as it accounts
        # for the potential of overlapping genes
        coding_mask = np_zeros(self.last_coding_base[seq_id])
        for pos in self.genes[seq_id].values():
            coding_mask[pos[0]:pos[1] + 1] = 1

        return coding_mask
Example #40
    def sim_score(self, src, tar):
        """Return the SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> cmp.sim_score('cat', 'hat')
        0
        >>> cmp.sim_score('Niall', 'Neil')
        3
        >>> cmp.sim_score('aluminum', 'Catalan')
        -11
        >>> cmp.sim_score('ATCG', 'TAGC')
        -1
        >>> cmp.sim_score('Stevenson', 'Stinson')
        16


        .. versionadded:: 0.4.0

        """
        src = self._tokenizer.tokenize(src).get_list()
        tar = self._tokenizer.tokenize(tar).get_list()

        src = ''.join([_[0].upper() + _[1:].lower() for _ in src])
        tar = ''.join([_[0].upper() + _[1:].lower() for _ in tar])

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
        for i in range(len(src)):
            d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src[i])
        for j in range(len(tar)):
            d_mat[0, j + 1] = d_mat[0, j] + self._g(tar[j])

        for i in range(len(src)):
            for j in range(len(tar)):
                d_mat[i + 1, j + 1] = max(
                    d_mat[i, j + 1] + self._g(src[i]),  # ins
                    d_mat[i + 1, j] + self._g(tar[j]),  # del
                    d_mat[i, j] + self._s(src[i], tar[j]),  # sub/==
                )

        return d_mat[len(src), len(tar)]
Example #41
    def add_reference_surface(self, xmin, xmax, ymin, ymax, image):
        cx = np_linspace(xmin/self.kx,xmax/self.kx,image.shape[1])
        cy = np_linspace(ymin/self.ky,ymax/self.ky,image.shape[0])
        cz = np_zeros((image.shape[1],image.shape[0]))


        ref_tex = pg_makeRGBA(np_rot90(image, k=3))[0]/255.
        self.ref_surf = gl.GLSurfacePlotItem(x=cx, y=cy, z=cz, colors = ref_tex, shader='balloon')
        self.ref_surf.translate(-self.xoff,-self.yoff,self.zoff)
        self.addItem(self.ref_surf)

        return self.ref_surf
Example #42
    def transformCP(self, silent=False, nolog=False, min=None, max=None):
        """Do the main ransformation on the coverage profile data"""
        shrinkFn = np_log10
        if(nolog):
            shrinkFn = lambda x:x
         
        s = (self.numContigs,3)
        self.transformedCP = np_zeros(s)

        if(not silent):
            print "    Dimensionality reduction"

        # get the median distance from the origin
        unit_vectors = [(np_cos(i*2*np_pi/self.numStoits),np_sin(i*2*np_pi/self.numStoits)) for i in range(self.numStoits)]
        for i in range(len(self.indices)):
            norm = np_norm(self.covProfiles[i])
            if(norm != 0):
                radial = shrinkFn(norm)
            else:
                radial = norm
            shifted_vector = np_array([0.0,0.0])
            flat_vector = (self.covProfiles[i] / sum(self.covProfiles[i]))
            
            for j in range(self.numStoits):
                shifted_vector[0] += unit_vectors[j][0] * flat_vector[j]
                shifted_vector[1] += unit_vectors[j][1] * flat_vector[j]

            # log scale it towards the centre
            scaling_vector = shifted_vector * self.scaleFactor
            sv_size = np_norm(scaling_vector)
            if(sv_size > 1):
                shifted_vector /= shrinkFn(sv_size)

            self.transformedCP[i,0] = shifted_vector[0]
            self.transformedCP[i,1] = shifted_vector[1]
            self.transformedCP[i,2] = radial

        if(not silent):
            print "    Reticulating splines"
            
        # finally scale the matrix to make it equal in all dimensions
        if(min is None):                
            min = np_amin(self.transformedCP, axis=0)
            max = np_amax(self.transformedCP, axis=0)
            max = max - min
            max = max / (self.scaleFactor-1)

        for i in range(0,3):
            self.transformedCP[:,i] = (self.transformedCP[:,i] -  min[i])/max[i]

        return(min,max)
Example #43
    def _parse_data(self, infile):
        data = {}
        with open(infile) as fp:
            fp.readline()
            genomes = set()
            for line in fp:
                fields = line.rstrip().split('\t')
                fields[0] = re.sub(r'_genes$', "", fields[0])
                fields[2] = re.sub(r'_genes$', "", fields[2])
                genomes.add(fields[0])
                genomes.add(fields[2])
                try:
                    data[fields[0]][fields[2]] = [float(fields[5]), float(fields[7])]
                except KeyError:
                    data[fields[0]] = {}
                    data[fields[0]][fields[2]] = [float(fields[5]), float(fields[7])]
                except IndexError as e:
                    print(fields)
                    raise e

        self.perc_ids = np_zeros([len(genomes), len(genomes)])
        self.perc_aln = np_zeros([len(genomes), len(genomes)])
        genome_to_index = {}
        self.genomes = [None] * len(genomes)
        for n, g in enumerate(alphanumeric_sort(genomes)):
            genome_to_index[g] = n
            self.genomes[n] = g

        self.genomes = np_array(self.genomes)
        for g1, g2 in permutations(genomes, 2):
            try:
                self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - data[g1][g2][0]
                self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = data[g1][g2][1]
            except KeyError:
                self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - data[g2][g1][0]
                self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = data[g2][g1][1]
Example #44
    def __init__(self, dbFileName, plot=False, force=False, numImgMaps=1):
        # worker classes
        self.PM = ProfileManager(dbFileName)  # store our data
        self.BM = BinManager(pm=self.PM)  # store our bins

        # heat maps
        self.numImgMaps = numImgMaps
        self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
        self.blurredMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))

        # we need a way to reference from the imageMaps back onto the transformed data
        self.im2RowIndicies = {}

        # When blurring the raw image maps I chose a radius to suit my data; you can vary this as you like
        self.blurRadius = 2
        self.span = 30  # amount we can travel about when determining "hot spots"

        # misc
        self.minSize = 10  # Min number of contigs for a bin to be considered legit
        self.minVol = 1000000  # Override on the min size, if we have this many BP
        self.forceWriting = force
        self.debugPlots = plot
        self.imageCounter = 1  # when we print many images
        self.roundNumber = 0  # how many times have we tried to make a bin?
Example #45
def array2PETScVec(v):
    """
    Converts (copies) a sequential array/vector on process 0
    to a distributed PETSc Vec
    input : v, numpy array on proc 0, None (or whatever) on other proc
    output: PETSc Vec distributed on all procs
    """

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # v is (probably) only redefined on proc 0
    if rank == 0:
        n = len(v)
    else:
        n = None

    n = comm.bcast(n, root = 0)
    #print "DEBUG", __name__, "rank=", rank, "n=", n

    x = PETSc.Vec()
    x.create(comm)
    x.setSizes(n)
    x.setFromOptions()
    istart,iend = x.getOwnershipRange()

    nloc = iend - istart
    Istart = comm.gather(istart,root = 0)
    Iend   = comm.gather(iend  ,root = 0)

    vloc = np_zeros(nloc,PETSc.ScalarType)

    if rank == 0:
        vloc[:nloc  ] = v[:nloc]

    for iproc in range(1,comm.size):
        if rank == 0:
            i0        = Istart[iproc]
            i1        = Iend  [iproc]
            comm.Send(v[i0:i1], dest=iproc, tag=77)
        elif rank == iproc:
            comm.Recv(vloc, source=0, tag=77)

    x.setArray(vloc)

    return x
Example #46
def PETScVec2array(x):
    """
    Converts (copies) a distributed PETSc Vec to a sequential array on process 0
    input : x, PETSc Vec distributed on all procs
    output: numpy array on proc 0
    """

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    vloc = x.getArray()
    n    = x.getSize()

    istart,iend = x.getOwnershipRange()

    nloc = iend - istart
    Istart = comm.gather(istart,root = 0)
    Iend   = comm.gather(iend  ,root = 0)

    if rank == 0:
        v = np_zeros(n,PETSc.ScalarType)
    else:
        v = None

    if rank == 0:
        v[:nloc  ] = vloc

    for iproc in range(1,comm.size):
        if rank == 0:
            i0        = Istart[iproc]
            i1        = Iend  [iproc]
            comm.Recv(v[i0:i1], source=iproc, tag=77)
        elif rank == iproc:
            comm.Send(vloc, dest=0, tag=77)

    return v
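A round-trip sketch for the two helpers above, assuming petsc4py/mpi4py provide PETSc and MPI as the code expects and the script is launched under MPI (e.g. mpiexec -n 4 python script.py):

import numpy as np
from mpi4py import MPI
from petsc4py import PETSc

comm = MPI.COMM_WORLD
v_in = np.arange(100, dtype=PETSc.ScalarType) if comm.Get_rank() == 0 else None

x = array2PETScVec(v_in)    # scatter from rank 0 into a distributed Vec
x.scale(2.0)                # any distributed PETSc operation
v_out = PETScVec2array(x)   # gather back onto rank 0

if comm.Get_rank() == 0:
    assert np.allclose(v_out, 2 * np.arange(100))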
Example #47
    def __init__(self,
                 orbit,
                 orbit_dict,
                 q_rects = None,
                 roi_movable = False,
                 lock_aspect = True,
                 parent = None,
                 labels = 1,
                 x_label = 'x',
                 y_label = 'y',
                 x_unit = "",
                 y_unit = "",
                 v_offset = (0,0),
                 prefs = None,
                 depth_meas = True,
                 iface = None):

        super(OrbitViewer, self).__init__(parent)

        self.plots = []
        data_f = []
        sim_f = []
        self.v_offset = v_offset
        self.v_offset_data = self.v_offset[0]
        self.v_offset_sim = self.v_offset[1]
        self.orbit_label = orbit_dict.get_instrument() + " - Orbit "+str(orbit)
        self.x_unit = x_unit
        self.y_unit = y_unit
        self.orbit_dict=orbit_dict
        self.prefs = prefs
        self.iface = iface

        if orbit_dict.data:
            for band in orbit_dict.data:
                data_f.append(np_mean(band,0))

        else:
            for band in orbit_dict.sim:
                data_f.append(np_zeros(band.shape[1:]))


        if orbit_dict.sim:
            for band in orbit_dict.sim:
                sim_f.append(np_mean(band,0))

        else:
            for band in orbit_dict.data:
                sim_f.append(np_zeros(band.shape[1:]))

        ii = 0
        for band in orbit_dict.data:
            depth_cb = CreateDepthLayer(self.orbit_dict, ii, QgsProject.instance().readPath("./"), self.iface)
            self.plots.append(SinglePlot(images = [data_f[ii], sim_f[ii]],
                                         images_label = ["data", "sim"],
                                         label_text = self.orbit_label+" Frequency band "+str(ii+1),
                                         q_rects = q_rects,
                                         roi_movable = roi_movable,
                                         lock_aspect = lock_aspect,
                                         x_label = x_label,
                                         y_label = y_label,
                                         x_unit = x_unit,
                                         y_unit = y_unit,
                                         depth_cb = depth_cb.run,
                                         depth_meas = depth_meas))

            self.addItem(self.plots[-1], row=0, col=ii)

        self.set_pos_label(0)
Example #48
0
    def dist_abs(self, src, tar):
        """Return the Gotoh score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Gotoh score

        Examples
        --------
        >>> cmp = Gotoh()
        >>> cmp.dist_abs('cat', 'hat')
        2.0
        >>> cmp.dist_abs('Niall', 'Neil')
        1.0
        >>> round(cmp.dist_abs('aluminum', 'Catalan'), 12)
        -0.4


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
        p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
        q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)

        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len(src) + 1):
            d_mat[i, 0] = float('-inf')
            p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            q_mat[i, 1] = -self._gap_open
        for j in range(1, len(tar) + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            p_mat[1, j] = -self._gap_open
            q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1)

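        # d_mat holds the best score for alignments ending in a match or
        # substitution, while p_mat and q_mat hold the best scores for
        # alignments ending in a gap in tar or in src, respectively.
        # Tracking these three cases separately is what yields affine gap
        # costs (gap_open to start a gap, gap_ext to extend it).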
        for i in range(1, len(src) + 1):
            for j in range(1, len(tar) + 1):
                sim_val = self._sim_func(src[i - 1], tar[j - 1])
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )

                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self._gap_open,
                    p_mat[i - 1, j] - self._gap_ext,
                )

                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self._gap_open,
                    q_mat[i, j - 1] - self._gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
Example #49
0
    def loadData(self,
                 timer,
                 condition,                 # condition as set by another function
                 bids=[],                   # if this is set then only load those contigs with these bin ids
                 verbose=True,              # many to some output messages
                 silent=False,              # some to no output messages
                 loadCovProfiles=True,
                 loadKmerPCs=True,
                 loadKmerVarPC=True,
                 loadRawKmers=False,
                 makeColors=True,
                 loadContigNames=True,
                 loadContigLengths=True,
                 loadContigGCs=True,
                 loadBins=False,
                 loadLinks=False):
        """Load pre-parsed data"""

        timer.getTimeStamp()
        if(silent):
            verbose=False
        if verbose:
            print "Loading data from:", self.dbFileName

        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            self.indices = self.dataManager.getConditionalIndices(self.dbFileName,
                                                                  condition=condition,
                                                                  silent=silent)
            if(verbose):
                print "    Loaded indices with condition:", condition
            self.numContigs = len(self.indices)

            if self.numContigs == 0:
                print "    ERROR: No contigs loaded using condition:", condition
                return

            if(not silent):
                print "    Working with: %d contigs" % self.numContigs

            if(loadCovProfiles):
                if(verbose):
                    print "    Loading coverage profiles"
                self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices)
                self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles])

            if loadRawKmers:
                if(verbose):
                    print "    Loading RAW kmer sigs"
                self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices)

            if(loadKmerPCs):
                self.kmerPCs = self.dataManager.getKmerPCAs(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer sigs (" + str(len(self.kmerPCs[0])) + " dimensional space)"

                self.kmerNormPC1 = np_copy(self.kmerPCs[:,0])
                self.kmerNormPC1 -= np_min(self.kmerNormPC1)
                self.kmerNormPC1 /= np_max(self.kmerNormPC1)

            if(loadKmerVarPC):
                self.kmerVarPC = self.dataManager.getKmerVarPC(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer variance (total variance: %.2f" % np_sum(self.kmerVarPC) + ")"

            if(loadContigNames):
                if(verbose):
                    print "    Loading contig names"
                self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices)

            if(loadContigLengths):
                self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig lengths (Total: %d BP)" % ( sum(self.contigLengths) )

            if(loadContigGCs):
                self.contigGCs = self.dataManager.getContigGCs(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig GC ratios (Average GC: %0.3f)" % ( np_mean(self.contigGCs) )

            if(makeColors):
                if(verbose):
                    print "    Creating color map"

                # use HSV to RGB to generate colors
                S = 1       # SAT and VAL remain fixed at 1. Reduce to make
                V = 1       # Pastels if that's your preference...
                self.colorMapGC = self.createColorMapHSV()

            if(loadBins):
                if(verbose):
                    print "    Loading bin assignments"

                self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices)

                if len(bids) != 0: # need to make sure we're not restricted in terms of bins
                    bin_stats = self.getBinStats()
                    for bid in bids:
                        try:
                            self.validBinIds[bid] = bin_stats[bid][0]
                            self.isLikelyChimeric[bid]= bin_stats[bid][1]
                        except KeyError:
                            self.validBinIds[bid] = 0
                            self.isLikelyChimeric[bid]= False

                else:
                    bin_stats = self.getBinStats()
                    for bid in bin_stats:
                        self.validBinIds[bid] = bin_stats[bid][0]
                        self.isLikelyChimeric[bid] = bin_stats[bid][1]

                # fix the binned indices
                self.binnedRowIndices = {}
                for i in range(len(self.indices)):
                    if(self.binIds[i] != 0):
                        self.binnedRowIndices[i] = True
            else:
                # we need zeros as bin indices then...
                self.binIds = np_zeros(len(self.indices))

            if(loadLinks):
                self.loadLinks()

            self.stoitColNames = self.getStoitColNames()

        except:
            print "Error loading DB:", self.dbFileName, exc_info()[0]
            raise
Example #50
0
    def _group_linkage_intersection(self):
        r"""Return the group linkage intersection of the tokens in src and tar.

        This is based on group linkage, as defined by :cite:`On:2007`.

        Most of this method is concerned with solving the assignment problem,
        in order to find the weight of the maximum weight bipartite matching.
        If the system has SciPy installed, we use its linear_sum_assignment
        function to get the assignments. Otherwise, we use the Hungarian
        algorithm of Munkres :cite:`Munkres:1957`, implemented in Python &
        NumPy.

        .. versionadded:: 0.4.0

        """
        intersection = self._crisp_intersection()
        src_only = sorted(self._src_tokens - self._tar_tokens)
        tar_only = sorted(self._tar_tokens - self._src_tokens)

        if linear_sum_assignment and not (
            'internal_assignment_problem' in self.params
            and self.params['internal_assignment_problem']
        ):
            arr = np_zeros((len(tar_only), len(src_only)))

            for col in range(len(src_only)):
                for row in range(len(tar_only)):
                    arr[row, col] = self.params['metric'].dist(
                        src_only[col], tar_only[row]
                    )

            for row, col in zip(*linear_sum_assignment(arr)):
                sim = 1.0 - arr[row, col]
                if sim >= self.params['threshold']:
                    intersection[src_only[col]] += (sim / 2) * (
                        self._src_tokens - self._tar_tokens
                    )[src_only[col]]
                    intersection[tar_only[row]] += (sim / 2) * (
                        self._tar_tokens - self._src_tokens
                    )[tar_only[row]]
        else:
            n = max(len(tar_only), len(src_only))
            arr = np_zeros((n, n), dtype=float)

            for col in range(len(src_only)):
                for row in range(len(tar_only)):
                    arr[row, col] = self.params['metric'].dist(
                        src_only[col], tar_only[row]
                    )

            src_only += [''] * (n - len(src_only))
            tar_only += [''] * (n - len(tar_only))

            orig_sim = 1 - np_copy(arr)

            # Step 1
            for row in range(n):
                arr[row, :] -= arr[row, :].min()
            # Step 2
            for col in range(n):
                arr[:, col] -= arr[:, col].min()

            while True:
                # Step 3
                assignments = {}

                allocated_cols = set()
                allocated_rows = set()
                assigned_rows = set()
                assigned_cols = set()

                for row in range(n):
                    if (arr[row, :] == 0.0).sum() == 1:
                        col = arr[row, :].argmin()
                        if col not in allocated_cols:
                            assignments[row, col] = orig_sim[row, col]
                            allocated_cols.add(col)
                            assigned_rows.add(row)
                            assigned_cols.add(col)

                for col in range(n):
                    if (arr[:, col] == 0.0).sum() == 1:
                        row = arr[:, col].argmin()
                        if row not in allocated_rows:
                            assignments[row, col] = orig_sim[row, col]
                            allocated_rows.add(row)
                            assigned_rows.add(row)
                            assigned_cols.add(col)

                if len(assignments) == n:
                    break

                marked_rows = {_ for _ in range(n) if _ not in assigned_rows}
                marked_cols = set()
                for row in sorted(set(marked_rows)):
                    for col, mark in enumerate(arr[row, :] == 0.0):
                        if mark:
                            marked_cols.add(col)
                            for row2 in range(n):
                                if (row2, col) in assignments:
                                    marked_rows.add(row2)

                if n - len(marked_rows) + len(marked_cols) == n:
                    # We have sufficient lines
                    for col in range(n):
                        row = arr[:, col].argmin()
                        assignments[row, col] = orig_sim[row, col]
                    break

                # Step 4
                min_val = arr[tuple(marked_rows), :][
                    :, sorted(set(range(n)) - marked_cols)
                ].min()
                for row in range(n):
                    for col in range(n):
                        if row in marked_rows and col not in marked_cols:
                            arr[row, col] -= min_val
                        elif row not in marked_rows and col in marked_cols:
                            arr[row, col] += min_val

            for row, col in assignments.keys():
                sim = orig_sim[row, col]
                if sim >= self.params['threshold']:
                    intersection[src_only[col]] += (sim / 2) * (
                        self._src_tokens - self._tar_tokens
                    )[src_only[col]]
                    intersection[tar_only[row]] += (sim / 2) * (
                        self._tar_tokens - self._src_tokens
                    )[tar_only[row]]

        return intersection
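# --- Sanity-check sketch (not part of the original example) ---
# The assignment step in isolation, assuming SciPy is available:
# linear_sum_assignment minimizes total distance, which is equivalent to
# maximizing total similarity (sim = 1 - dist) over the matching.
import numpy as np
from scipy.optimize import linear_sum_assignment

arr = np.array([[0.1, 0.9],    # rows: tar-only tokens
                [0.8, 0.2]])   # cols: src-only tokens
for row, col in zip(*linear_sum_assignment(arr)):
    print(row, col, 1.0 - arr[row, col])
# -> pairs (0, 0) and (1, 1), with similarities 0.9 and 0.8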
Example #51
0
    def findNewClusterCenters(self, ss=0):
        """Find a putative cluster"""

        inRange = lambda x, l, u: x >= l and x < u

        # we work from the top view as this has the base clustering
        max_index = np_argmax(self.blurredMaps[0])
        max_value = self.blurredMaps[0].ravel()[max_index]

        max_x = int(max_index / self.PM.scaleFactor)
        max_y = max_index - self.PM.scaleFactor * max_x
        max_z = -1

        ret_values = [max_value, max_x, max_y]

        start_span = int(1.5 * self.span)
        span_len = 2 * start_span + 1

        if self.debugPlots:
            self.plotRegion(max_x, max_y, max_z, fileName="Image_" + str(self.imageCounter), tag="column", column=True)
            self.imageCounter += 1

        # make a 3d grid to hold the values
        working_block = np_zeros((span_len, span_len, self.PM.scaleFactor))

        # go through the entire column
        (x_lower, x_upper) = self.makeCoordRanges(max_x, start_span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, start_span)
        super_putative_row_indices = []
        for p in self.im2RowIndicies:
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper):
                for row_index in self.im2RowIndicies[p]:
                    # check that the point is real and that it has not yet been binned
                    if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                        # this is an unassigned point.
                        multiplier = np_log10(self.PM.contigLengths[row_index])
                        self.incrementAboutPoint3D(
                            working_block, p[0] - x_lower, p[1] - y_lower, p[2], multiplier=multiplier
                        )
                        super_putative_row_indices.append(row_index)

        # blur and find the highest value
        bwb = ndi.gaussian_filter(working_block, 8)  # blur radius (hard-coded in place of self.blurRadius)
        densest_index = np_unravel_index(np_argmax(bwb), (np_shape(bwb)))
        max_x = densest_index[0] + x_lower
        max_y = densest_index[1] + y_lower
        max_z = densest_index[2]

        # now get the basic color of this dense point
        putative_center_row_indices = []

        (x_lower, x_upper) = self.makeCoordRanges(max_x, self.span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, self.span)
        (z_lower, z_upper) = self.makeCoordRanges(max_z, 2 * self.span)

        for row_index in super_putative_row_indices:
            p = np_around(self.PM.transformedCP[row_index])
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper) and inRange(p[2], z_lower, z_upper):
                # we are within the range!
                putative_center_row_indices.append(row_index)

        # make sure we have something to go on here
        if np_size(putative_center_row_indices) == 0:
            # it's all over!
            return None

        if np_size(putative_center_row_indices) == 1:
            # get out of here but keep trying
            # the calling function may restrict these indices
            return [[np_array(putative_center_row_indices)], ret_values]
        else:
            total_BP = sum([self.PM.contigLengths[i] for i in putative_center_row_indices])
            if not self.isGoodBin(total_BP, len(putative_center_row_indices), ms=5):  # Can we trust very small bins?.
                # get out of here but keep trying
                # the calling function should restrict these indices
                return [[np_array(putative_center_row_indices)], ret_values]
            else:
                # we've got a few good guys here, partition them up!
                # shift these guys around a bit
                center_k_vals = np_array([self.PM.kmerVals[i] for i in putative_center_row_indices])
                k_partitions = self.partitionVals(center_k_vals)

                if len(k_partitions) == 0:
                    return None
                else:
                    center_c_vals = np_array([self.PM.transformedCP[i][-1] for i in putative_center_row_indices])
                    # center_c_vals = np_array([self.PM.averageCoverages[i] for i in putative_center_row_indices])
                    center_c_vals -= np_min(center_c_vals)
                    c_max = np_max(center_c_vals)
                    if c_max != 0:
                        center_c_vals /= c_max
                    c_partitions = self.partitionVals(center_c_vals)

                    # take the intersection of the two partitions
                    tmp_partition_hash_1 = {}
                    id = 1
                    for p in k_partitions:
                        for i in p:
                            tmp_partition_hash_1[i] = id
                        id += 1

                    tmp_partition_hash_2 = {}
                    id = 1
                    for p in c_partitions:
                        for i in p:
                            try:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)].append(i)
                            except KeyError:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)] = [i]
                        id += 1

                    partitions = [
                        np_array([putative_center_row_indices[i] for i in tmp_partition_hash_2[key]])
                        for key in tmp_partition_hash_2.keys()
                    ]

                    # pcs = [[self.PM.averageCoverages[i] for i in p] for p in partitions]
                    # print pcs
                    return [partitions, ret_values]
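# --- Standalone sketch of the partition intersection above (hypothetical
# helper, not part of the original class) ---
def intersect_partitions(part_a, part_b):
    """Group indices sharing both a part in part_a and a part in part_b."""
    label_a = dict((i, n) for n, p in enumerate(part_a) for i in p)
    label_b = dict((i, n) for n, p in enumerate(part_b) for i in p)
    groups = {}
    for i in label_a:
        groups.setdefault((label_a[i], label_b[i]), []).append(i)
    return list(groups.values())

# intersect_partitions([[0, 1, 2], [3]], [[0, 1], [2, 3]])
# -> [[0, 1], [2], [3]]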
Example #52
0
def csrmatrix2PETScMat(L):
    """
    Converts a sequential scipy sparse matrix (on process 0) to a PETSc
    Mat ('aij') matrix distributed on all processes
    input : L, scipy sparse matrix on proc 0
    output: PETSc matrix distributed on all procs
    """

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Get the data from the sequential scipy matrix
    if rank == 0:
        if L.format == 'csr':
            L2 = L
        else:
            L2 = L.tocsr()
        Ai  = L2.indptr
        Aj  = L2.indices
        Av  = L2.data
        nnz = len(Aj)
        n,m = L2.shape
    else:
        n   = None
        m   = None
        nnz = None
        Ai  = None
        Aj  = None
        Av  = None

    # Broadcast sizes
    n   = comm.bcast(n  ,root = 0)
    m   = comm.bcast(m  ,root = 0)
    nnz = comm.bcast(nnz,root = 0)

    B = PETSc.Mat()
    B.create(comm)
    B.setSizes([n, m])
    B.setType('aij')
    B.setFromOptions()

    # Create a vector to get the local sizes, so that preallocation can be done later
    V = PETSc.Vec()
    V.create(comm)
    V.setSizes(n)
    V.setFromOptions()
    istart,iend = V.getOwnershipRange()
    V.destroy()

    nloc = iend - istart

    Istart = comm.gather(istart,root = 0)
    Iend   = comm.gather(iend  ,root = 0)

    if rank == 0:
        nnzloc = np_zeros(comm.size,'int')
        for i in range(comm.size):
            j0        = Ai[Istart[i]]
            j1        = Ai[Iend  [i]]
            nnzloc[i] = j1 - j0
    else:
        nnzloc = None

    nnzloc = comm.scatter(nnzloc,root = 0)

    # local CSR arrays; note this assumes scipy's index dtype matches
    # PETSc.IntType (both are typically 32-bit integers)
    ai = np_zeros(nloc+1 ,PETSc.IntType)
    aj = np_zeros(nnzloc ,PETSc.IntType)
    av = np_zeros(nnzloc ,PETSc.ScalarType)

    if rank == 0:
        j0          = Ai[Istart[0]]
        j1          = Ai[Iend  [0]]
        ai[:nloc+1] = Ai[:nloc+1]
        aj[:nnzloc] = Aj[j0:j1]
        av[:nnzloc] = Av[j0:j1]

    for iproc in range(1,comm.size):
        if rank == 0:
            i0        = Istart[iproc]
            i1        = Iend  [iproc]
            j0        = Ai[i0]
            j1        = Ai[i1]
            comm.Send(Ai[i0:i1+1], dest=iproc, tag=77)
            comm.Send(Aj[j0:j1], dest=iproc, tag=78)
            comm.Send(Av[j0:j1], dest=iproc, tag=79)
        elif rank == iproc:
            comm.Recv(ai[:nloc+1], source=0, tag=77)
            comm.Recv(aj[:nnzloc], source=0, tag=78)
            comm.Recv(av[:nnzloc], source=0, tag=79)

    # shift the row pointers so they are local to this process; after the
    # shift ai[0] == 0 and ai[-1] == nnzloc, as the CSR format requires
    ai = ai - ai[0]

    B.setPreallocationCSR((ai,aj))
    B.setValuesCSR(ai,aj,av)
    B.assemble()

    return B
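# --- Usage sketch (not part of the original example) ---
# Assumes petsc4py/mpi4py are installed and the script runs under mpirun;
# the scipy matrix only needs to exist on process 0.
import numpy as np
from mpi4py import MPI
from scipy.sparse import csr_matrix

if MPI.COMM_WORLD.Get_rank() == 0:
    L = csr_matrix(np.array([[ 2., -1.,  0.],
                             [-1.,  2., -1.],
                             [ 0., -1.,  2.]]))
else:
    L = None

B = csrmatrix2PETScMat(L)      # distributed PETSc Mat on all procs
print(MPI.COMM_WORLD.Get_rank(), B.getOwnershipRange())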
Example #53
0
def zeros(n, type='float32'):
    return np_zeros(n, type)
Example #54
0
    def alignment(self, src, tar, score_only=False):
        """Return the ALINE alignments of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        score_only : bool
            Return the score only, not the alignments

        Returns
        -------
        list(tuple(float, str, str)) or float
            ALINE alignments and their scores or the top score

        Examples
        --------
        >>> cmp = ALINE()
        >>> cmp.alignment('cat', 'hat')
        [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
        >>> cmp.alignment('niall', 'neil')
        [(90.0, '‖ n i a ll ‖', '‖ n e i l  ‖')]
        >>> cmp.alignment('aluminum', 'catalan')
        [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
        >>> cmp.alignment('atcg', 'tagc')
        [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
        '‖ t  a g ‖ c')]


        .. versionadded:: 0.4.0

        """

        def _sig_skip(seg):
            return self._c_skip

        def _sig_sub(seg1, seg2):
            return (
                self._c_sub
                - _delta(seg1, seg2)
                - _sig_vwl(seg1)
                - _sig_vwl(seg2)
            )

        def _sig_exp(seg1, seg2a, seg2b):
            return (
                self._c_exp
                - _delta(seg1, seg2a)
                - _delta(seg1, seg2b)
                - _sig_vwl(seg1)
                - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
            )

        def _sig_vwl(seg):
            return (
                0.0
                if seg['manner'] > self.feature_weights['high vowel']
                else self._c_vwl
            )

        def _delta(seg1, seg2):
            features = (
                self.c_features
                if max(seg1['manner'], seg2['manner'])
                > self.feature_weights['high vowel']
                else self.v_features
            )
            diff = 0.0
            for f in features:
                diff += (
                    abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f]
                )
            return diff

        def _retrieve(i, j, score, out):
            def _record(score, out):
                out.append(('‖', '‖'))
                for i1 in range(i - 1, -1, -1):
                    out.append((src[i1]['segment'], ''))
                for j1 in range(j - 1, -1, -1):
                    out.append(('', tar[j1]['segment']))
                if self._mode == 'global':
                    score += (i + j) * _sig_skip('')

                out = out[::-1]

                src_alignment = []
                tar_alignment = []

                out.append(('‖', '‖'))
                part = 0
                s_segment = ''
                t_segment = ''
                for ss, ts in out:
                    if ss == '‖':
                        if part % 2 == 0:
                            src_alignment.append(s_segment)
                            tar_alignment.append(t_segment)
                            s_segment = []
                            t_segment = []
                        else:
                            src_alignment.append(' '.join(s_segment))
                            tar_alignment.append(' '.join(t_segment))
                            s_segment = ''
                            t_segment = ''
                        part += 1
                    else:
                        if part % 2 == 0:
                            s_segment += ss
                            t_segment += ts
                        else:
                            s_segment.append(ss + ' ' * (len(ts) - len(ss)))
                            t_segment.append(ts + ' ' * (len(ss) - len(ts)))

                src_alignment = ' ‖ '.join(src_alignment).strip()
                tar_alignment = ' ‖ '.join(tar_alignment).strip()

                alignments.append((score, src_alignment, tar_alignment))
                return

            if s_mat[i, j] == 0:
                _record(score, out)
                return
            else:
                if (
                    i > 0
                    and j > 0
                    and s_mat[i - 1, j - 1]
                    + _sig_sub(src[i - 1], tar[j - 1])
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(
                        (src[i - 1]['segment'], tar[j - 1]['segment'])
                    )
                    _retrieve(
                        i - 1,
                        j - 1,
                        score + _sig_sub(src[i - 1], tar[j - 1]),
                        loc_out,
                    )
                    loc_out.pop()

                if (
                    j > 0
                    and s_mat[i, j - 1] + _sig_skip(tar[j - 1]) + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(('-', tar[j - 1]['segment']))
                    _retrieve(i, j - 1, score + _sig_skip(tar[j - 1]), loc_out)
                    loc_out.pop()

                if (
                    i > 0
                    and j > 1
                    and s_mat[i - 1, j - 2]
                    + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(
                        (
                            src[i - 1]['segment'],
                            tar[j - 2]['segment'] + tar[j - 1]['segment'],
                        )
                    )
                    _retrieve(
                        i - 1,
                        j - 2,
                        score + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1]),
                        loc_out,
                    )
                    loc_out.pop()

                if (
                    i > 0
                    and s_mat[i - 1, j] + _sig_skip(src[i - 1]) + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append((src[i - 1]['segment'], '-'))
                    _retrieve(i - 1, j, score + _sig_skip(src[i - 1]), loc_out)
                    loc_out.pop()

                if (
                    i > 1
                    and j > 0
                    and s_mat[i - 2, j - 1]
                    + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(
                        (
                            src[i - 2]['segment'] + src[i - 1]['segment'],
                            tar[j - 1]['segment'],
                        )
                    )
                    _retrieve(
                        i - 2,
                        j - 1,
                        score + _sig_exp(tar[j - 1], src[i - 2], src[i - 1]),
                        loc_out,
                    )
                    loc_out.pop()

        sg_max = 0.0

        src = list(src)
        tar = list(tar)

        for ch in range(len(src)):
            if src[ch] in self._phones:
                seg = src[ch]
                src[ch] = dict(self._phones[src[ch]])
                src[ch]['segment'] = seg
        for ch in range(len(tar)):
            if tar[ch] in self._phones:
                seg = tar[ch]
                tar[ch] = dict(self._phones[tar[ch]])
                tar[ch]['segment'] = seg

        src = [fb for fb in src if isinstance(fb, dict)]
        tar = [fb for fb in tar if isinstance(fb, dict)]

        for i in range(1, len(src)):
            if 'supplemental' in src[i]:
                j = i - 1
                while j > -1:
                    if 'supplemental' not in src[j]:
                        for key, value in src[i].items():
                            if key != 'supplemental':
                                if key == 'segment':
                                    src[j]['segment'] += value
                                else:
                                    src[j][key] = value
                        j = 0
                    j -= 1
        src = [fb for fb in src if 'supplemental' not in fb]

        for i in range(1, len(tar)):
            if 'supplemental' in tar[i]:
                j = i - 1
                while j > -1:
                    if 'supplemental' not in tar[j]:
                        for key, value in tar[i].items():
                            if key != 'supplemental':
                                if key == 'segment':
                                    tar[j]['segment'] += value
                                else:
                                    tar[j][key] = value
                        j = 0
                    j -= 1
        tar = [fb for fb in tar if 'supplemental' not in fb]

        for i in range(len(src)):
            for key in src[i].keys():
                if key != 'segment':
                    src[i][key] = self.feature_weights[src[i][key]]
        for i in range(len(tar)):
            for key in tar[i].keys():
                if key != 'segment':
                    tar[i][key] = self.feature_weights[tar[i][key]]

        src_len = len(src)
        tar_len = len(tar)

        s_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)

        if self._mode == 'global':
            for i in range(1, src_len + 1):
                s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src[i - 1])
            for j in range(1, tar_len + 1):
                s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar[j - 1])

        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                s_mat[i, j] = max(
                    s_mat[i - 1, j] + _sig_skip(src[i - 1]),
                    s_mat[i, j - 1] + _sig_skip(tar[j - 1]),
                    s_mat[i - 1, j - 1] + _sig_sub(src[i - 1], tar[j - 1]),
                    s_mat[i - 1, j - 2]
                    + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                    if j > 1
                    else NINF,
                    s_mat[i - 2, j - 1]
                    + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                    if i > 1
                    else NINF,
                    0 if self._mode in {'local', 'half-local'} else NINF,
                )

                if s_mat[i, j] > sg_max:
                    if self._mode == 'semi-global':
                        if i == src_len or j == tar_len:
                            sg_max = s_mat[i, j]
                    else:
                        sg_max = s_mat[i, j]

        if self._mode in {'global', 'half-local'}:
            dp_score = s_mat[src_len, tar_len]
        else:
            dp_score = s_mat.max()

        if score_only:
            return dp_score

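        # Keep every alignment scoring within a factor of (1 - epsilon) of
        # the best dynamic-programming score; _retrieve then walks s_mat
        # backwards to reconstruct each qualifying alignment.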
        threshold = (1 - self._epsilon) * dp_score

        alignments = []

        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                if self._mode in {'global', 'half-local'} and (
                    i < src_len or j < tar_len
                ):
                    continue
                if self._mode == 'semi-global' and (
                    i < src_len and j < tar_len
                ):
                    continue
                if s_mat[i, j] >= threshold:
                    out = []
                    for j1 in range(tar_len - 1, j - 1, -1):
                        out.append(('', tar[j1]['segment']))
                    for i1 in range(src_len - 1, i - 1, -1):
                        out.append((src[i1]['segment'], ''))
                    out.append(('‖', '‖'))
                    _retrieve(i, j, 0, out)

        def _first_element(x):
            return x[0]

        return sorted(alignments, key=_first_element, reverse=True)
Example #55
0
 def encode(self, C, maxlen):
     """Encode as one-hot"""
     X = np_zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member
     for i, c in enumerate(C):
         X[i, self.char_indices[c]] = 1
     return X
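# --- Usage sketch (not part of the original example) ---
# A minimal CharacterTable-style stand-in exposing the two attributes that
# encode() relies on; an older NumPy is assumed, since encode() uses np.bool.
import numpy as np
from numpy import zeros as np_zeros  # alias used by encode()


class CharacterTable(object):
    def __init__(self, chars):
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))


CharacterTable.encode = encode           # attach the method shown above
ctable = CharacterTable('0123456789+ ')
print(ctable.encode('12+3', maxlen=7).shape)   # -> (7, 12)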
Example #56
0
    def dist_abs(self, src, tar):
        """Return the Editex distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            Editex distance

        Examples
        --------
        >>> cmp = Editex()
        >>> cmp.dist_abs('cat', 'hat')
        2
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('aluminum', 'Catalan')
        12
        >>> cmp.dist_abs('ATCG', 'TAGC')
        6


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        match_cost, group_cost, mismatch_cost = self._cost

        def r_cost(ch1, ch2):
            """Return r(a,b) according to Zobel & Dart's definition.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                r(a,b) according to Zobel & Dart's definition

            .. versionadded:: 0.1.0

            """
            if ch1 == ch2:
                return match_cost
            if ch1 in self._all_letters and ch2 in self._all_letters:
                for group in self._letter_groups:
                    if ch1 in group and ch2 in group:
                        return group_cost
            return mismatch_cost

        def d_cost(ch1, ch2):
            """Return d(a,b) according to Zobel & Dart's definition.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                d(a,b) according to Zobel & Dart's definition

            .. versionadded:: 0.1.0

            """
            if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
                return group_cost
            return r_cost(ch1, ch2)

        # convert both src & tar to NFKD normalized unicode
        src = unicode_normalize('NFKD', text_type(src.upper()))
        tar = unicode_normalize('NFKD', text_type(tar.upper()))
        # convert ß to SS (for Python2)
        src = src.replace('ß', 'SS')
        tar = tar.replace('ß', 'SS')

        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        if src == tar:
            return 0.0
        if not src:
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(tar_len)
            )
        if not tar:
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(src_len)
            )

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
        src = ' ' + src
        tar = ' ' + tar

        if not self._local:
            for i in range(1, src_len + 1):
                d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
                    src[i - 1], src[i]
                ) * self._taper(i, max_len)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + d_cost(
                tar[j - 1], tar[j]
            ) * self._taper(j, max_len)

        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                d_mat[i, j] = min(
                    d_mat[i - 1, j]
                    + d_cost(src[i - 1], src[i])
                    * self._taper(max(i, j), max_len),
                    d_mat[i, j - 1]
                    + d_cost(tar[j - 1], tar[j])
                    * self._taper(max(i, j), max_len),
                    d_mat[i - 1, j - 1]
                    + r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len),
                )

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return d_mat[src_len, tar_len]
Example #57
0
 def blurMaps(self):
     """Blur the 2D image maps"""
     self.blurredMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
     for i in range(self.numImgMaps):  # top, front and side
         self.blurredMaps[i, :, :] = ndi.gaussian_filter(self.imageMaps[i, :, :], 8)  # blur radius (hard-coded in place of self.blurRadius)
Example #58
0
    def dist_abs(self, src, tar):
        """Return the Damerau-Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The Damerau-Levenshtein distance between src & tar

        Raises
        ------
        ValueError
            Unsupported cost assignment; the cost of two transpositions must
            not be less than the cost of an insert plus a delete.

        Examples
        --------
        >>> cmp = DamerauLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        if src == tar:
            return 0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        if 2 * trans_cost < ins_cost + del_cost:
            raise ValueError(
                'Unsupported cost assignment; the cost of two transpositions '
                + 'must not be less than the cost of an insert plus a delete.'
            )

        d_mat = np_zeros((len(src), len(tar)), dtype=np_int)

        if src[0] != tar[0]:
            d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

        src_index_by_character = {src[0]: 0}
        for i in range(1, len(src)):
            del_distance = d_mat[i - 1, 0] + del_cost
            ins_distance = (i + 1) * del_cost + ins_cost
            match_distance = i * del_cost + (
                0 if src[i] == tar[0] else sub_cost
            )
            d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

        for j in range(1, len(tar)):
            del_distance = (j + 1) * ins_cost + del_cost
            ins_distance = d_mat[0, j - 1] + ins_cost
            match_distance = j * ins_cost + (
                0 if src[0] == tar[j] else sub_cost
            )
            d_mat[0, j] = min(del_distance, ins_distance, match_distance)

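        # src_index_by_character maps each character of src to the last row
        # where it occurred, and max_src_letter_match_index is the last
        # column of the current row where src[i] matched tar; together they
        # locate the most recent candidate transposition.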
        for i in range(1, len(src)):
            max_src_letter_match_index = 0 if src[i] == tar[0] else -1
            for j in range(1, len(tar)):
                candidate_swap_index = (
                    -1
                    if tar[j] not in src_index_by_character
                    else src_index_by_character[tar[j]]
                )
                j_swap = max_src_letter_match_index
                del_distance = d_mat[i - 1, j] + del_cost
                ins_distance = d_mat[i, j - 1] + ins_cost
                match_distance = d_mat[i - 1, j - 1]
                if src[i] != tar[j]:
                    match_distance += sub_cost
                else:
                    max_src_letter_match_index = j

                if candidate_swap_index != -1 and j_swap != -1:
                    i_swap = candidate_swap_index

                    if i_swap == 0 and j_swap == 0:
                        pre_swap_cost = 0
                    else:
                        pre_swap_cost = d_mat[
                            max(0, i_swap - 1), max(0, j_swap - 1)
                        ]
                    swap_distance = (
                        pre_swap_cost
                        + (i - i_swap - 1) * del_cost
                        + (j - j_swap - 1) * ins_cost
                        + trans_cost
                    )
                else:
                    swap_distance = maxsize

                d_mat[i, j] = min(
                    del_distance, ins_distance, match_distance, swap_distance
                )
            src_index_by_character[src[i]] = i

        return d_mat[len(src) - 1, len(tar) - 1]
Example #59
0
    def dist_abs(self, src, tar):
        """Return the Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = Levenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3

        >>> cmp = Levenshtein(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2
        >>> cmp.dist_abs('ACTG', 'TAGC')
        4


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        if src == tar:
            return 0
        if not src:
            return sum(
                ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
            )
        if not tar:
            return sum(
                del_cost * self._taper(pos, max_len) for pos in range(src_len)
            )

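        # _taper supplies a position-dependent weight: when tapering is
        # enabled, edit penalties decrease with position in the string
        # (after Zobel & Dart); with tapering disabled it is constant.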
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        for i in range(src_len + 1):
            d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
        for j in range(tar_len + 1):
            d_mat[0, j] = j * self._taper(j, max_len) * ins_cost

        for i in range(src_len):
            for j in range(tar_len):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j]
                    + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
                    d_mat[i, j + 1]
                    + del_cost * self._taper(1 + max(i, j), max_len),  # del
                    d_mat[i, j]
                    + (
                        sub_cost * self._taper(1 + max(i, j), max_len)
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )

                if self._mode == 'osa':
                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1],
                            d_mat[i - 1, j - 1]
                            + trans_cost * self._taper(1 + max(i, j), max_len),
                        )

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return d_mat[src_len, tar_len]