Ejemplo n.º 1
0
def token_to_segment(token, segment_list, diacritic_list):
    '''Build a Segment object from an IPA string token.

    token is the IPA string to convert. segment_list and diacritic_list
    are lists of dictionaries describing the known segments and
    diacritics; every diacritic dictionary is read for its 'IPA' and
    'applies' keys.

    The characters of the token that are not diacritic strings are looked
    up with find_segment to create the base Segment, then the
    positive/negative features of every diacritic found in the token are
    added on top, in diacritic_list order.
    '''

    known_marks = [entry['IPA'] for entry in diacritic_list]

    # Whatever is left once the diacritic characters are dropped is the base
    base_string = ''.join(char for char in token if char not in known_marks)

    # Diacritic entries whose IPA string occurs somewhere in the token
    present = [entry for entry in diacritic_list if entry['IPA'] in token]

    # Start from the plain segment...
    result = Segment.from_dictionary(find_segment(base_string, segment_list))

    # ...and layer each diacritic's feature changes over it
    for entry in present:
        overlay = Segment(entry['applies'].get('positive', []),
                          entry['applies'].get('negative', []))
        result = result + overlay

    return result
Ejemplo n.º 2
0
def token_to_segment(token, segment_list, diacritic_list):
    '''Build a Segment object from an IPA string token.

    token is the IPA string to convert. segment_list and diacritic_list
    are lists of dictionaries describing the known segments and
    diacritics; every diacritic dictionary is read for its 'IPA' and
    'applies' keys.

    The characters of the token that are not diacritic strings are looked
    up with find_segment to create the base Segment, then the
    positive/negative/zero features of every diacritic found in the token
    are added on top, in diacritic_list order.
    '''

    known_marks = [entry['IPA'] for entry in diacritic_list]

    # Whatever is left once the diacritic characters are dropped is the base
    base_string = ''.join(char for char in token if char not in known_marks)

    # Diacritic entries whose IPA string occurs somewhere in the token
    present = [entry for entry in diacritic_list if entry['IPA'] in token]

    # Start from the plain segment...
    result = Segment.from_dictionary(find_segment(base_string, segment_list))

    # ...and layer each diacritic's feature changes over it
    for entry in present:
        overlay = Segment(entry['applies'].get('positive', []),
                          entry['applies'].get('negative', []),
                          entry['applies'].get('zero', []))
        result = result + overlay

    return result
Ejemplo n.º 3
0
def test_initialisation():
    '''Segment.from_dictionary should sort '+' and '-' features into the
    positive and negative lists; the '0' feature and the 'IPA' entry are
    expected in neither.'''
    features = dict(stress='+', long='-', continuant='0', IPA='b')

    built = Segment.from_dictionary(features)

    assert built.positive == ['stress']
    assert built.negative == ['long']
Ejemplo n.º 4
0
def test_addition():
    '''Adding a diacritic-style Segment to a base Segment should apply the
    diacritic's features: 'syllabic' is expected to move from negative to
    positive and 'voice' to appear as negative.'''
    features = dict(stress='+', syllabic='-', continuant='0', IPA='b')

    base = Segment.from_dictionary(features)
    overlay = Segment(['syllabic'], ['voice'])

    combined = base + overlay

    assert combined.positive == ['stress', 'syllabic']
    assert combined.negative == ['voice']
Ejemplo n.º 5
0
def test_initialisation():
    '''Segment.from_dictionary should sort '+' and '-' features into the
    positive and negative lists; the '0' feature and the 'IPA' entry are
    expected in neither.'''
    features = dict(stress='+', long='-', continuant='0', IPA='b')

    built = Segment.from_dictionary(features)

    assert built.positive == ['stress']
    assert built.negative == ['long']
def benchmark_match_accuracy(segments, diacritics, filename):
    '''Measure how reliably segments are recovered from feature strings.

    The feature strings are loaded from the given file under engine/data.
    Every segment, and every segment + diacritic combination whose
    conditions the base segment meets, is passed through
    deparse.segment_match and the result compared with the expected IPA
    string.

    Returns a tuple (base accuracy, overall accuracy), each the ratio of
    successful matches to attempted matches. Mismatches among the base
    segments are printed.

    '''
    strings_path = path.join(base_directory, 'engine', 'data', filename)
    feature_strings = load_feature_strings(strings_path)

    print('Loaded {0} feature strings'.format(len(feature_strings)))

    base_matches = []
    matches = []

    deparse.initialise_cache()

    for entry in segments:
        plain = Segment.from_dictionary(entry)

        # The plain segment counts towards both tallies, so it is matched
        # once for each list
        base_matches.append(
            (entry['IPA'], deparse.segment_match(feature_strings, plain)))
        matches.append(
            (entry['IPA'], deparse.segment_match(feature_strings, plain)))

        for mark in diacritics:
            combined_ipa = entry['IPA'] + mark['IPA']

            # Skip diacritics that cannot attach to this segment
            if not plain.meets_conditions(mark['conditions']):
                continue

            modified = plain + Segment(mark['applies'].get('positive', []),
                                       mark['applies'].get('negative', []))
            matches.append(
                (combined_ipa,
                 deparse.segment_match(feature_strings, modified)))

    print('Calculating base accuracy...')
    base_successes = 0
    for expected, deparsed in base_matches:
        if expected == deparsed:
            base_successes += 1
        else:
            print('\tExpected {0}, deparsed {1}'.format(expected, deparsed))

    print('Calculating diacritic accuracy...')
    successes = sum(1 for expected, deparsed in matches
                    if expected == deparsed)

    return (base_successes / len(base_matches)), (successes / len(matches))
Ejemplo n.º 7
0
def test_addition():
    '''Adding a diacritic-style Segment to a base Segment should apply the
    diacritic's features: 'syllabic' is expected to move from negative to
    positive and 'voice' to appear as negative.'''
    features = dict(stress='+', syllabic='-', continuant='0', IPA='b')

    base = Segment.from_dictionary(features)
    overlay = Segment(['syllabic'], ['voice'])

    combined = base + overlay

    assert combined.positive == ['stress', 'syllabic']
    assert combined.negative == ['voice']
Ejemplo n.º 8
0
def benchmark_match_accuracy(segments, diacritics, filename):
    '''Measure how reliably segments are recovered from feature strings.

    The feature strings are loaded from the given file under engine/data.
    Every segment, and every segment + diacritic combination whose
    conditions the base segment meets, is passed through
    deparse.segment_match and the result compared with the expected IPA
    string.

    Returns a tuple (base accuracy, overall accuracy), each the ratio of
    successful matches to attempted matches. Mismatches among the base
    segments are printed.

    '''
    strings_path = path.join(base_directory, 'engine', 'data', filename)
    feature_strings = load_feature_strings(strings_path)

    print('Loaded {0} feature strings'.format(len(feature_strings)))

    base_matches = []
    matches = []

    deparse.initialise_cache()

    for entry in segments:
        plain = Segment.from_dictionary(entry)

        # The plain segment counts towards both tallies, so it is matched
        # once for each list
        base_matches.append(
            (entry['IPA'], deparse.segment_match(feature_strings, plain)))
        matches.append(
            (entry['IPA'], deparse.segment_match(feature_strings, plain)))

        for mark in diacritics:
            combined_ipa = entry['IPA'] + mark['IPA']

            # Skip diacritics that cannot attach to this segment
            if not plain.meets_conditions(mark['conditions']):
                continue

            modified = plain + Segment(mark['applies'].get('positive', []),
                                       mark['applies'].get('negative', []))
            matches.append(
                (combined_ipa,
                 deparse.segment_match(feature_strings, modified)))

    print('Calculating base accuracy...')
    base_successes = 0
    for expected, deparsed in base_matches:
        if expected == deparsed:
            base_successes += 1
        else:
            print('\tExpected {0}, deparsed {1}'.format(expected, deparsed))

    print('Calculating diacritic accuracy...')
    successes = sum(1 for expected, deparsed in matches
                    if expected == deparsed)

    return (base_successes / len(base_matches)), (successes / len(matches))
Ejemplo n.º 9
0
def test_setters():
    '''add_positive and add_negative should move a feature between the
    positive and negative lists, and repeating a call should change
    nothing.'''
    features = dict(stress='+', long='-', continuant='0', IPA='b')

    subject = Segment.from_dictionary(features)

    # Marking 'long' positive is expected to take it out of the negatives
    subject.add_positive('long')
    assert subject.positive == ['stress', 'long']
    assert subject.negative == []

    # Marking 'stress' negative is expected to take it out of the positives
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']

    # A second identical call must not duplicate the entry
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']
Ejemplo n.º 10
0
def test_setters():
    '''add_positive and add_negative should move a feature between the
    positive and negative lists, and repeating a call should change
    nothing.'''
    features = dict(stress='+', long='-', continuant='0', IPA='b')

    subject = Segment.from_dictionary(features)

    # Marking 'long' positive is expected to take it out of the negatives
    subject.add_positive('long')
    assert subject.positive == ['stress', 'long']
    assert subject.negative == []

    # Marking 'stress' negative is expected to take it out of the positives
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']

    # A second identical call must not duplicate the entry
    subject.add_negative('stress')
    assert subject.positive == ['long']
    assert subject.negative == ['stress']
def main():
    '''Generate feature strings for all segments, with and without
    diacritics, and report duplicates.

    Segments are loaded from engine/data/features.csv and diacritics from
    engine/data/diacritics.yaml (both under base_directory). Two passes
    are made:

    1. A feature string is generated for every plain segment.
    2. A feature string is generated for every plain segment, for every
       segment + diacritic combination whose conditions the segment
       meets, and for a long ('ː') variant of each such combination when
       the base segment is syllabic and the combination is not already
       long.

    After each pass the count is printed and print_duplicates is called.
    The second list is written as (IPA, feature string) rows to
    feature-strings-with-diacritics.csv in the current working directory.
    '''

    # Add IPA strings here to print segment values when encountered
    diagnostic_targets = []

    segments = load_segments(
        path.join(base_directory, 'engine', 'data', 'features.csv'))

    diacritics_path = path.join(base_directory, 'engine', 'data',
                                'diacritics.yaml')

    # safe_load instead of a bare yaml.load(f): calling yaml.load without a
    # Loader is deprecated since PyYAML 5.1 and raises TypeError from 6.0.
    # The file holds IPA characters, so the encoding is stated explicitly
    # rather than left to the platform default.
    with open(diacritics_path, encoding='utf-8') as f:
        diacritics = yaml.safe_load(f)

    print('Generating basic feature strings')
    print('================================')
    feature_strings = []

    for segment in segments:
        base_segment = Segment.from_dictionary(segment)
        feature_strings.append(
            (segment['IPA'], deparse.feature_string(base_segment)))

    print('\nGenerated {0} feature strings'.format(len(feature_strings)))
    print_duplicates(feature_strings)

    print('\nGenerating diacritic feature strings')
    print('================================')
    # Start over: the second pass rebuilds the list including the plain
    # segments, so the first pass's results are discarded
    feature_strings = []

    for segment in segments:
        base_segment = Segment.from_dictionary(segment)
        feature_strings.append(
            (segment['IPA'], deparse.feature_string(base_segment)))

        if segment['IPA'] in diagnostic_targets:
            print('Target found: {0}'.format(segment['IPA']))
            print('\tPositive: {0}'.format(base_segment.positive))
            print('\tNegative: {0}'.format(base_segment.negative))

        for diacritic in diacritics:
            IPA_representation = segment['IPA'] + diacritic['IPA']

            if base_segment.meets_conditions(diacritic['conditions']):
                diacritic_segment = base_segment + Segment(
                    diacritic['applies'].get('positive', []),
                    diacritic['applies'].get('negative', []))
                feature_strings.append(
                    (IPA_representation,
                     deparse.feature_string(diacritic_segment)))

                # Syllabic segments additionally get a long variant, unless
                # the diacritic itself already made the segment long
                is_syllabic = base_segment.meets_conditions(
                    {'positive': ['syllabic']})
                if is_syllabic and not diacritic_segment.meets_conditions(
                        {'positive': ['long']}):
                    long_segment = base_segment + Segment(
                        diacritic['applies'].get('positive', []),
                        diacritic['applies'].get('negative', [])
                    ) + Segment(['long'], [])

                    feature_strings.append(
                        (IPA_representation + 'ː',
                         deparse.feature_string(long_segment)))

                if IPA_representation in diagnostic_targets:
                    print('Target found: {0}'.format(IPA_representation))
                    print('\tPositive: {0}'.format(diacritic_segment.positive))
                    print('\tNegative: {0}'.format(diacritic_segment.negative))

    print('\nGenerated {0} feature strings'.format(len(feature_strings)))
    print_duplicates(feature_strings)

    # newline='' is what the csv module requires of its file objects
    # (otherwise blank rows appear on Windows); utf-8 so the IPA characters
    # can always be encoded.
    with open('feature-strings-with-diacritics.csv', 'w', newline='',
              encoding='utf-8') as f:
        csv.writer(f).writerows(feature_strings)