def token_to_segment(token, segment_list, diacritic_list):
    '''Build a Segment object from a string token written in IPA.

    segment_list and diacritic_list are lists of dictionaries describing the
    known segments and diacritics. The characters of the token that are not
    diacritic strings are looked up with find_segment to create the base
    Segment; the positive and negative features applied by each diacritic
    whose IPA string occurs in the token are then added to it.
    '''
    known_marks = {entry['IPA'] for entry in diacritic_list}

    # Whatever remains once diacritic characters are dropped is the base symbol
    base_string = ''.join(char for char in token if char not in known_marks)

    # Diacritic entries whose IPA string appears somewhere in the token
    present = [entry for entry in diacritic_list if entry['IPA'] in token]

    result = Segment.from_dictionary(find_segment(base_string, segment_list))

    # Fold each diacritic's features into the running result
    for entry in present:
        applies = entry['applies']
        modifier = Segment(applies.get('positive', []),
                           applies.get('negative', []))
        result = result + modifier

    return result
def token_to_segment(token, segment_list, diacritic_list):
    '''Turn an IPA string token into a Segment object.

    segment_list and diacritic_list are lists of dictionaries describing the
    known segments and diacritics. The token is stripped of diacritic
    characters and the remainder is resolved with find_segment to give the
    base Segment. Every diacritic whose IPA string occurs in the token then
    contributes its positive, negative and zero features.
    '''
    diacritic_symbols = {item['IPA'] for item in diacritic_list}

    # The base symbol is the token minus any diacritic characters
    plain_characters = [symbol for symbol in token
                        if symbol not in diacritic_symbols]
    base_string = ''.join(plain_characters)

    # Diacritics that actually occur in this token
    matched = [item for item in diacritic_list if item['IPA'] in token]

    combined = Segment.from_dictionary(find_segment(base_string, segment_list))

    # Layer the features of each matched diacritic on top of the base
    for item in matched:
        features = item['applies']
        overlay = Segment(features.get('positive', []),
                          features.get('negative', []),
                          features.get('zero', []))
        combined = combined + overlay

    return combined
def test_initialisation():
    '''A Segment built from a dictionary lists '+' features as positive and
    '-' features as negative, and nothing else.'''
    features = {'stress': '+', 'long': '-', 'continuant': '0', 'IPA': 'b'}

    built = Segment.from_dictionary(features)

    assert built.positive == ['stress']
    assert built.negative == ['long']
def test_addition():
    '''Adding two Segments: 'syllabic' moves from negative to positive and
    'voice' becomes the only negative feature.'''
    base = Segment.from_dictionary(
        {'stress': '+', 'syllabic': '-', 'continuant': '0', 'IPA': 'b'})
    syllabic_diacritic = Segment(['syllabic'], ['voice'])

    total = base + syllabic_diacritic

    assert total.positive == ['stress', 'syllabic']
    assert total.negative == ['voice']
def test_initialisation():
    '''Check the positive and negative lists of a Segment created with
    Segment.from_dictionary.'''
    source = {
        'stress': '+',
        'long': '-',
        'continuant': '0',
        'IPA': 'b',
    }
    parsed = Segment.from_dictionary(source)

    # Only the '+' feature is positive and only the '-' feature is negative
    assert parsed.positive == ['stress']
    assert parsed.negative == ['long']
def benchmark_match_accuracy(segments, diacritics, filename):
    '''Measure how well segments survive conversion through feature strings.

    The feature strings are loaded from filename inside the engine/data
    directory under base_directory. Each segment dictionary is converted to a
    Segment and matched against them with deparse.segment_match. For every
    diacritic whose conditions the base segment meets, the diacritic's
    positive and negative features are added and the result is matched too.
    A match succeeds when the deparsed value equals the expected IPA string;
    failed base matches are printed.

    Returns a tuple (base_accuracy, overall_accuracy). Each value is the
    number of successful matches divided by the number attempted, first for
    the bare segments only, then for bare segments and diacritic
    combinations together. Both are 0.0 when no segments are given.
    '''
    feature_strings = load_feature_strings(
        path.join(base_directory, 'engine', 'data', filename))
    print('Loaded {0} feature strings'.format(len(feature_strings)))

    base_matches = []
    matches = []

    deparse.initialise_cache()

    for segment in segments:
        base_segment = Segment.from_dictionary(segment)

        # Match the bare segment once; the same result counts towards both
        # the base tally and the overall tally.
        base_match = (segment['IPA'],
                      deparse.segment_match(feature_strings, base_segment))
        base_matches.append(base_match)
        matches.append(base_match)

        for diacritic in diacritics:
            IPA_representation = segment['IPA'] + diacritic['IPA']

            if base_segment.meets_conditions(diacritic['conditions']):
                applies = diacritic['applies']
                diacritic_segment = base_segment + Segment(
                    applies.get('positive', []),
                    applies.get('negative', []))

                matches.append((IPA_representation,
                                deparse.segment_match(feature_strings,
                                                      diacritic_segment)))

    print('Calculating base accuracy...')
    base_successes = 0
    for expected, deparsed in base_matches:
        if expected == deparsed:
            base_successes += 1
        else:
            print('\tExpected {0}, deparsed {1}'.format(expected, deparsed))

    print('Calculating diacritic accuracy...')
    successes = sum(1 for expected, deparsed in matches
                    if expected == deparsed)

    # With no segments there is nothing to divide by; report zero accuracy
    # rather than raising ZeroDivisionError.
    base_accuracy = base_successes / len(base_matches) if base_matches else 0.0
    overall_accuracy = successes / len(matches) if matches else 0.0

    return base_accuracy, overall_accuracy
def test_addition():
    '''Check the positive and negative lists produced by adding a
    diacritic Segment to a base Segment.'''
    base_features = {
        'stress': '+',
        'syllabic': '-',
        'continuant': '0',
        'IPA': 'b',
    }
    base = Segment.from_dictionary(base_features)

    # The diacritic makes the segment syllabic and voiceless
    diacritic = Segment(['syllabic'], ['voice'])
    combined = base + diacritic

    assert combined.positive == ['stress', 'syllabic']
    assert combined.negative == ['voice']
def benchmark_match_accuracy(segments, diacritics, filename):
    '''Convert all given segments to feature strings, then convert back to
    segments, and report how often the round trip succeeds.

    Use the given feature string file, loaded from the engine/data directory
    under base_directory. Each segment is matched on its own, and again with
    every diacritic whose conditions it meets (the diacritic's positive and
    negative features are added to the base segment). A match is successful
    when deparse.segment_match returns the expected IPA string; unsuccessful
    base matches are printed.

    Return a tuple of two accuracies, each computed as successes divided by
    attempts: the first covers the bare segments only, the second covers the
    bare segments together with all diacritic combinations. Both are 0.0 if
    segments is empty.
    '''
    feature_strings = load_feature_strings(
        path.join(base_directory, 'engine', 'data', filename))
    print('Loaded {0} feature strings'.format(len(feature_strings)))

    base_matches = []
    matches = []

    deparse.initialise_cache()

    for segment in segments:
        base_segment = Segment.from_dictionary(segment)

        # The bare segment only needs matching once; reuse the result for
        # both lists instead of calling segment_match a second time.
        base_result = deparse.segment_match(feature_strings, base_segment)
        base_matches.append((segment['IPA'], base_result))
        matches.append((segment['IPA'], base_result))

        for diacritic in diacritics:
            IPA_representation = segment['IPA'] + diacritic['IPA']

            if base_segment.meets_conditions(diacritic['conditions']):
                diacritic_segment = base_segment + Segment(
                    diacritic['applies'].get('positive', []),
                    diacritic['applies'].get('negative', []))

                diacritic_result = deparse.segment_match(feature_strings,
                                                         diacritic_segment)
                matches.append((IPA_representation, diacritic_result))

    print('Calculating base accuracy...')
    base_successes = 0
    for match in base_matches:
        if match[0] == match[1]:
            base_successes += 1
        else:
            print('\tExpected {0}, deparsed {1}'.format(match[0], match[1]))

    print('Calculating diacritic accuracy...')
    successes = len([match for match in matches if match[0] == match[1]])

    # Every segment contributes to both lists, so they are empty together;
    # avoid dividing by zero when no segments were supplied.
    if not base_matches:
        return 0.0, 0.0

    return (base_successes / len(base_matches)), (successes / len(matches))
def test_setters():
    '''add_positive and add_negative move a feature between the two lists,
    and repeating add_negative leaves the lists unchanged.'''
    subject = Segment.from_dictionary(
        {'stress': '+', 'long': '-', 'continuant': '0', 'IPA': 'b'})

    # 'long' starts out negative; making it positive removes it from negative
    subject.add_positive('long')
    assert subject.positive == ['stress', 'long']
    assert subject.negative == []

    # 'stress' starts out positive; making it negative removes it from positive
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']

    # A second identical call must not duplicate the feature
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']
def test_setters():
    '''Check the positive and negative lists after each call to
    add_positive and add_negative.'''
    starting_features = {
        'stress': '+',
        'long': '-',
        'continuant': '0',
        'IPA': 'b',
    }
    target = Segment.from_dictionary(starting_features)

    # Flip 'long' from negative to positive
    target.add_positive('long')
    assert target.positive == ['stress', 'long']
    assert target.negative == []

    # Flip 'stress' from positive to negative
    target.add_negative('stress')
    assert target.positive == ['long']
    assert target.negative == ['stress']

    # Adding the same negative feature again changes nothing
    target.add_negative('stress')
    assert target.positive == ['long']
    assert target.negative == ['stress']
def main():
    '''Generate feature strings for all segments and diacritic combinations.

    Segments are loaded from engine/data/features.csv and diacritics from
    engine/data/diacritics.yaml, both under base_directory. Two passes are
    made, each ending with a count and a call to print_duplicates:

    1. Feature strings for the bare segments only.
    2. Feature strings for the bare segments plus every diacritic
       combination whose conditions the base segment meets. Syllabic
       segments that the diacritic has not made long also get a long
       variant, written with a trailing 'ː'.

    The result of the second pass is written to
    feature-strings-with-diacritics.csv in the current working directory.
    '''
    # Add IPA strings here to print segment values when encountered
    diagnostic_targets = []

    data_directory = path.join(base_directory, 'engine', 'data')

    segments = load_segments(path.join(data_directory, 'features.csv'))

    # yaml.load() without an explicit Loader is rejected by PyYAML 6+ and can
    # construct arbitrary Python objects on older versions. safe_load() works
    # on every version and is enough for a plain data file. The file contains
    # IPA characters, so it is decoded as UTF-8 explicitly rather than with
    # the platform default encoding.
    with open(path.join(data_directory, 'diacritics.yaml'),
              encoding='utf-8') as f:
        diacritics = yaml.safe_load(f)

    print('Generating basic feature strings')
    print('================================')

    feature_strings = []
    for segment in segments:
        base_segment = Segment.from_dictionary(segment)
        feature_strings.append(
            (segment['IPA'], deparse.feature_string(base_segment)))

    print('\nGenerated {0} feature strings'.format(len(feature_strings)))
    print_duplicates(feature_strings)

    print('\nGenerating diacritic feature strings')
    print('================================')

    feature_strings = []
    for segment in segments:
        base_segment = Segment.from_dictionary(segment)
        feature_strings.append(
            (segment['IPA'], deparse.feature_string(base_segment)))

        if segment['IPA'] in diagnostic_targets:
            print('Target found: {0}'.format(segment['IPA']))
            print('\tPositive: {0}'.format(base_segment.positive))
            print('\tNegative: {0}'.format(base_segment.negative))

        for diacritic in diacritics:
            IPA_representation = segment['IPA'] + diacritic['IPA']

            # Everything below reads diacritic_segment, so it only runs for
            # diacritics that can apply to this base segment.
            if base_segment.meets_conditions(diacritic['conditions']):
                diacritic_segment = base_segment + Segment(
                    diacritic['applies'].get('positive', []),
                    diacritic['applies'].get('negative', []))

                feature_strings.append(
                    (IPA_representation,
                     deparse.feature_string(diacritic_segment)))

                # Syllabic segments that are not already long after the
                # diacritic is applied also get a lengthened variant.
                is_syllabic = base_segment.meets_conditions(
                    {'positive': ['syllabic']})
                if is_syllabic and not diacritic_segment.meets_conditions(
                        {'positive': ['long']}):
                    long_segment = base_segment + Segment(
                        diacritic['applies'].get('positive', []),
                        diacritic['applies'].get('negative', [])) + Segment(
                            ['long'], [])

                    feature_strings.append(
                        (IPA_representation + 'ː',
                         deparse.feature_string(long_segment)))

                if IPA_representation in diagnostic_targets:
                    print('Target found: {0}'.format(IPA_representation))
                    print('\tPositive: {0}'.format(
                        diacritic_segment.positive))
                    print('\tNegative: {0}'.format(
                        diacritic_segment.negative))

    print('\nGenerated {0} feature strings'.format(len(feature_strings)))
    print_duplicates(feature_strings)

    # newline='' stops the csv module's own line endings being translated
    # again on platforms that rewrite '\n'. UTF-8 is needed because the rows
    # contain IPA characters.
    with open('feature-strings-with-diacritics.csv', 'w', newline='',
              encoding='utf-8') as f:
        csv.writer(f).writerows(feature_strings)