def test_fix_string_case(self):
        """Check case normalisation done by validate.fix_string_case

        Each input below is passed through validate.fix_string_case
        and compared with the expected result: text is lowercased,
        except for the characters that stay uppercase in the expected
        values (per the original description, the case-sensitive
        characters defined in config).
        """
        # (input, expected output) pairs, checked in order
        cases = (
            ('ABOL taBOl', 'abOl tabOl'),
            ('KhiCuRi', 'khicuRi'),
            ('KaTh-BuRO', 'kaTh-buRO'),
            ('raMgoRurer Chana', 'ramgoRurer chana'),
        )
        for raw, expected in cases:
            self.assertEqual(validate.fix_string_case(raw), expected)
Ejemplo n.º 2
0
    def test_fix_string_case(self):
        """Test phonetic-compatible case-transformations of strings

        This ensures the validate.fix_string_case function works as
        expected. It should properly change text to lowercase but
        retain case-sensitive characters defined in config as
        uppercase.
        """
        # NOTE: assertEqual is used instead of the deprecated
        # assertEquals alias, which no longer exists in Python 3.12+.

        # 'ABOL taBOl' should become 'abOl tabOl'
        self.assertEqual(validate.fix_string_case('ABOL taBOl'), 'abOl tabOl')
        # 'KhiCuRi' should become 'khicuRi'
        self.assertEqual(validate.fix_string_case('KhiCuRi'), 'khicuRi')
        # 'KaTh-BuRO' should become 'kaTh-buRO'
        self.assertEqual(validate.fix_string_case('KaTh-BuRO'), 'kaTh-buRO')
        # 'raMgoRurer Chana' should become 'ramgoRurer chana'
        self.assertEqual(validate.fix_string_case('raMgoRurer Chana'),
                         'ramgoRurer chana')
Ejemplo n.º 3
0
def parse(text):
    """Parses input text, matches and replaces using avrodict

    The text is first case-normalised with validate.fix_string_case,
    then scanned left to right. At each position that has not already
    been consumed by an earlier match, non rule patterns are tried
    first, then rule patterns. A character for which nothing matches
    is copied to the output unchanged.

    If a valid replacement is found, returns the replaced string. If
    no replacement is found, returns the input text.

    Usage:

    ::

      from pyavrophonetic import avro
      avro.parse("ami banglay gan gai")

    """
    # Sanitize text case to meet phonetic comparison standards
    fixed_text = validate.fix_string_case(utf(text))
    # Pieces of the result, joined once at the end
    output = []
    # Index of the first character not yet consumed by a match
    cur_end = 0

    for cur, char in enumerate(fixed_text):
        # Trap characters with unicode encoding errors and pass them
        # through untouched. UnicodeError is the common base of
        # UnicodeEncodeError and UnicodeDecodeError, so the trap
        # fires whichever of the two encode() raises.
        try:
            char.encode('utf-8')
        except UnicodeError:
            cur_end = cur + 1
            output.append(char)
            continue

        # Cursor is inside a span that has already been
        # processed/replaced: nothing to do for this character
        if cur < cur_end:
            continue

        # Try looking in non rule patterns with current string portion
        match = match_non_rule_patterns(fixed_text, cur)
        if match["matched"]:
            output.append(match["replaced"])
            cur_end = cur + len(match["found"])
            continue

        # Non rule patterns have not matched, try rule patterns
        match = match_rule_patterns(fixed_text, cur)
        if match["matched"]:
            # Update cur_end as cursor + length of match found
            cur_end = cur + len(match["found"])
            # Process its rules
            replaced = process_rules(rules=match["rules"],
                                     fixed_text=fixed_text,
                                     cur=cur, cur_end=cur_end)
            # If any rule matched, output the replacement from the
            # rule, else output the pattern's default replacement
            if replaced is not None:
                output.append(replaced)
            else:
                output.append(match["replaced"])
            continue

        # Nothing matched: emit the present character as-is
        cur_end = cur + 1
        output.append(char)

    # End looping through input text and produce output
    return ''.join(output)