def test_fix_string_case(self):
    """Check the case normalisation done by validate.fix_string_case.

    Every sample below pairs a raw input with the value the function is
    expected to return. In the expected values most letters are
    lowercase, while some uppercase letters (such as 'O', 'R' and 'T')
    are kept as they are.
    """
    # (raw input, expected output) pairs; verified one after another so
    # the first mismatch fails the test.
    expectations = (
        ('ABOL taBOl', 'abOl tabOl'),
        ('KhiCuRi', 'khicuRi'),
        ('KaTh-BuRO', 'kaTh-buRO'),
        ('raMgoRurer Chana', 'ramgoRurer chana'),
    )
    for raw, expected in expectations:
        self.assertEqual(validate.fix_string_case(raw), expected)
def test_fix_string_case(self):
    """Test phonetic-compatible case-transformations of strings

    This ensures the validate.fix_string_case function works as
    expected. The expected values below show text changed to lowercase
    while some uppercase characters (e.g. 'O', 'R', 'T') are retained.

    Uses ``assertEqual``; the ``assertEquals`` spelling is a deprecated
    alias that is no longer available on recent Python versions.
    """
    # 'ABOL taBOl' should become 'abOl tabOl'
    self.assertEqual(validate.fix_string_case('ABOL taBOl'),
                     'abOl tabOl')
    # 'KhiCuRi' should become 'khicuRi'
    self.assertEqual(validate.fix_string_case('KhiCuRi'), 'khicuRi')
    # 'KaTh-BuRO' should become 'kaTh-buRO'
    self.assertEqual(validate.fix_string_case('KaTh-BuRO'),
                     'kaTh-buRO')
    # 'raMgoRurer Chana' should become 'ramgoRurer chana'
    self.assertEqual(validate.fix_string_case('raMgoRurer Chana'),
                     'ramgoRurer chana')
def parse(text):
    """Parses input text, matches and replaces using avrodict

    The text is first case-normalized with ``validate.fix_string_case``
    and then scanned left to right. At each position not already
    consumed by an earlier match, non-rule patterns are tried first and
    rule patterns second. When nothing matches, the character from the
    case-normalized text is copied to the output unchanged.

    Usage:

    ::
      from pyavrophonetic import avro
      avro.parse("ami banglay gan gai")

    :param text: input text to convert
    :returns: the converted pieces joined into a single string
    """
    # Sanitize text case to meet phonetic comparison standards
    fixed_text = validate.fix_string_case(utf(text))
    # Pieces of the result, joined once at the end
    output = []
    # First index that has not yet been consumed by a match
    cur_end = 0

    for cur, i in enumerate(fixed_text):
        # Trap characters that cannot be encoded. UnicodeError is the
        # common base of UnicodeEncodeError and UnicodeDecodeError, so
        # either failure is handled by passing the character through.
        try:
            i.encode('utf-8')
        except UnicodeError:
            cur_end = cur + 1
            output.append(i)
            continue

        # Position already covered by a previous (multi-character)
        # match: nothing to do.
        if cur < cur_end:
            continue

        # Try looking in non rule patterns with current string portion
        match = match_non_rule_patterns(fixed_text, cur)
        if match["matched"]:
            output.append(match["replaced"])
            cur_end = cur + len(match["found"])
            continue

        # Non rule patterns have not matched, try rule patterns
        match = match_rule_patterns(fixed_text, cur)
        if match["matched"]:
            # Update cur_end as cursor + length of match found
            cur_end = cur + len(match["found"])
            # Process its rules
            replaced = process_rules(rules=match["rules"],
                                     fixed_text=fixed_text,
                                     cur=cur, cur_end=cur_end)
            # If any rule matched, output its replacement; otherwise
            # fall back to the pattern's default replacement.
            if replaced is not None:
                output.append(replaced)
            else:
                output.append(match["replaced"])
            continue

        # Nothing matched: emit the current character as-is
        cur_end = cur + 1
        output.append(i)

    # End looping through input text and produce output
    return ''.join(output)