Ejemplo n.º 1
    def score(self, query, normalized=True, return_suffix_scores=False):
        """
        Match the string against the GAST using the algorithm described
        in [Chernyak, sections 1.3 & 1.4].

        Spaces are removed from the query before matching; apart from that
        the input is expected to consist of alphabet letters only.

        query -- Unicode string to score
        normalized -- if true, the score of every suffix is divided by the
                      number of its characters that were matched
        return_suffix_scores -- if true, return a (score, suffix_scores)
                                tuple, where suffix_scores maps each suffix
                                of the query to its individual score

        Returns the score (a float in [0, 1]); an empty query scores 0.0.
        """
        query = query.replace(" ", "")
        if not query:
            # Nothing to match; also avoids dividing by a zero length below.
            return (0.0, {}) if return_suffix_scores else 0.0

        result = 0
        suffix_scores = {}

        # For each suffix of the string:
        for suffix_start in range(len(query)):

            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0

            # Walk down from the root, one arc per iteration.
            child_node = self.root.chose_arc(suffix)
            while child_node:
                nodes_matched += 1
                (str_ind, substr_start, substr_end) = child_node.arc()
                # NOTE(review): the arc label is assumed to live in
                # self.strings_collection -- confirm the attribute name.
                match = utils.match_strings(
                    suffix,
                    self.strings_collection[str_ind][substr_start:substr_end])
                suffix_score += child_node.conditional_probability()
                matched_chars += match
                suffix = suffix[match:]
                if suffix and match == substr_end - substr_start:
                    # The whole arc matched and there is input left: descend.
                    child_node = child_node.chose_arc(suffix)
                else:
                    # Partial arc match or exhausted suffix: stop the walk.
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            suffix_scores[query[suffix_start:]] = suffix_result

        # Average the per-suffix results over all suffixes of the query.
        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
Ejemplo n.º 2
 def _ukkonen_first_phases(string_ind):
     """
     Looks for the part of the string which is already encoded.

     Reads `strings_collection` and `root` from the enclosing scope and
     records the length of the already encoded prefix in
     `root._e[string_ind]`.

     Returns a tuple of form
     ([length of already encoded string prefix],
      [tree node to start the first explicit phase with],
      [path to go down at the beginning of the first explicit phase]).

     The path is a (string index, start, end) triple; it stays (0, 0, 0)
     when the walk did not stop in the middle of an arc.
     """
     already_in_tree = 0
     suffix = strings_collection[string_ind]
     starting_path = (0, 0, 0)
     starting_node = root
     child_node = starting_node.chose_arc(suffix)
     while child_node:
         (str_ind, substr_start, substr_end) = child_node.arc()
         match = utils.match_strings(
             suffix, strings_collection[str_ind][substr_start:substr_end])
         already_in_tree += match
         if match == substr_end - substr_start:
             # matched the arc, proceed with child node
             suffix = suffix[match:]
             starting_node = child_node
             child_node = starting_node.chose_arc(suffix)
         else:
             # otherwise we will have to proceed certain path at the beginning
             # of the first explicit phase
             starting_path = (str_ind, substr_start, substr_start + match)
             break
     # For constant updating of all leafs, see [Gusfield {RUS}, p. 139]
     root._e[string_ind] = already_in_tree
     return (already_in_tree, starting_node, starting_path)
Ejemplo n.º 4
    def _score(self, query, normalized=True, return_suffix_scores=False):
        """
        Score `query` by matching each of its suffixes against the indexed
        string, walking child intervals from the root interval downwards.

        Intervals are tuples; per the note in the loop below, element 0 is
        the interval's `l` value and element 1 its `i` bound (element 2 is
        passed to `_get_child_interval` as the other bound).

        query -- string to score
        normalized -- if true, the score of every suffix is divided by the
                      number of its characters that were matched
        return_suffix_scores -- if true, return a (score, suffix_scores)
                                tuple, where suffix_scores maps each suffix
                                of the query to its individual score

        Returns the per-suffix results averaged over the query length;
        an empty query scores 0.0.
        """
        if not query:
            # Nothing to match; also avoids dividing by a zero length below.
            return (0.0, {}) if return_suffix_scores else 0.0

        result = 0
        suffix_scores = {}
        n = len(self.suftab)

        root_interval = (0, 0, n - 1)

        for suffix_start in range(len(query)):

            suffix = query[suffix_start:]
            suffix_score = 0
            suffix_result = 0
            matched_chars = 0
            nodes_matched = 0

            parent_node = root_interval
            child_node = self._get_child_interval(parent_node[1],
                                                  parent_node[2], suffix[0])
            while child_node:
                nodes_matched += 1
                # TODO: Use structs??? child_node[1] is actually cn.i; parent_node[0] == pn.l
                substr_start = self.suftab[child_node[1]] + parent_node[0]
                if self._is_leaf(child_node):
                    # A leaf's label runs up to position n.
                    substr_end = n
                else:
                    substr_end = substr_start + child_node[0] - parent_node[0]
                match = utils.match_strings(
                    suffix, self.string[substr_start:substr_end])
                suffix_score += float(self._annotation(
                    child_node)) / self._annotation(parent_node)
                matched_chars += match
                suffix = suffix[match:]
                if suffix and match == substr_end - substr_start:
                    # The whole label matched and there is input left: descend.
                    parent_node = child_node
                    child_node = self._get_child_interval(
                        parent_node[1], parent_node[2], suffix[0])
                else:
                    # Partial label match or exhausted suffix: stop the walk.
                    break

            if matched_chars:
                suffix_result = (suffix_score + matched_chars - nodes_matched)
                if normalized:
                    suffix_result /= matched_chars
                result += suffix_result

            suffix_scores[query[suffix_start:]] = suffix_result

        # Average the per-suffix results over all suffixes of the query.
        result /= len(query)

        if return_suffix_scores:
            result = result, suffix_scores

        return result
Ejemplo n.º 6
 def test_match_strings_empty(self):
     """Check string pairs for which match_strings is expected to give 0."""
     # (first argument, second argument) pairs, checked in order.
     zero_match_pairs = (("abc", "bc"), ("", ""))
     for first, second in zero_match_pairs:
         self.assertEqual(utils.match_strings(first, second), 0)
