Beispiel #1
0
def test_lexicon_enrichment(timed_config, csv_test_dir):
    """Enrich the lexicon from ``timed_enrichment.txt`` and check the imported word properties.

    ``timed_config`` and ``csv_test_dir`` are supplied by the caller
    (presumably pytest fixtures -- confirm against the test suite's conftest).
    """
    enrichment_file = os.path.join(csv_test_dir, 'timed_enrichment.txt')
    with CorpusContext(timed_config) as corpus:
        enrich_lexicon_from_csv(corpus, enrichment_file)

        # Every word whose neighborhood density is below 10 must be 'guess'.
        sparse_rows = (
            corpus.query_graph(corpus.word)
            .filter(corpus.word.neighborhood_density < 10)
            .columns(corpus.word.label.column_name('label'))
            .all()
        )
        for row in sparse_rows:
            assert row['label'] == 'guess'

        # 'i' carries all three enriched properties.
        i_rows = corpus.query_graph(corpus.word).filter(corpus.word.label == 'i').all()
        first_i = i_rows[0]
        assert first_i['frequency'] == 150
        assert first_i['part_of_speech'] == 'PRP'
        assert first_i['neighborhood_density'] == 17

        # 'cute' has no frequency value after enrichment.
        cute_rows = corpus.query_graph(corpus.word).filter(corpus.word.label == 'cute').all()
        first_cute = cute_rows[0]
        assert first_cute['frequency'] is None
        assert first_cute['part_of_speech'] == 'JJ'
        assert first_cute['neighborhood_density'] == 14

        pos_levels = corpus.lexicon.get_property_levels('part_of_speech')
        assert set(pos_levels) == {'NN', 'VB', 'JJ', 'IN', 'PRP'}
Beispiel #2
0
def test_lexicon_enrichment(timed_config, csv_test_dir):
    """Check word properties after enriching the lexicon from a CSV file.

    The enrichment file is ``timed_enrichment.txt`` inside ``csv_test_dir``.
    """
    path = os.path.join(csv_test_dir, 'timed_enrichment.txt')
    with CorpusContext(timed_config) as c:
        enrich_lexicon_from_csv(c, path)

        def rows_for_label(label):
            # All word rows whose label equals ``label``.
            return c.query_graph(c.word).filter(c.word.label == label).all()

        low_density = c.query_graph(c.word).filter(
            c.word.neighborhood_density < 10)
        low_density = low_density.columns(c.word.label.column_name('label'))
        labels = [row['label'] for row in low_density.all()]
        assert labels == ['guess'] * len(labels)

        word_i = rows_for_label('i')[0]
        assert word_i['frequency'] == 150
        assert word_i['part_of_speech'] == 'PRP'
        assert word_i['neighborhood_density'] == 17

        word_cute = rows_for_label('cute')[0]
        assert word_cute['frequency'] is None
        assert word_cute['part_of_speech'] == 'JJ'
        assert word_cute['neighborhood_density'] == 14

        expected_levels = set(['NN', 'VB', 'JJ', 'IN', 'PRP'])
        observed_levels = set(c.lexicon.get_property_levels('part_of_speech'))
        assert observed_levels == expected_levels
Beispiel #3
0
def lexicon_enrichment(config, unisyn_spade_directory, dialect_code):
    """Enrich the corpus lexicon from the Unisyn enrichment CSV files.

    Scans ``<unisyn_spade_directory>/enrichment_files`` and loads:

    * ``rule_applications.csv`` (dialect independent), unless the word type
      property ``unisynprimstressedvowel1`` is already present;
    * every file whose name starts with ``dialect_code``, unless the word
      type property ``unisynprimstressedvowel2_<dialect_code>`` (lower-cased)
      is already present.

    All other files are ignored.  Each load is timed and the duration is
    both printed and passed to ``save_performance_benchmark``.

    Parameters
    ----------
    config
        Corpus configuration handed to ``CorpusContext``.
    unisyn_spade_directory : str
        Directory expected to contain an ``enrichment_files`` sub-directory.
    dialect_code : str
        File-name prefix selecting the dialect specific enrichment files.

    Returns
    -------
    None
        Returns early (after printing a notice) when the enrichment
        directory is missing.
    """
    enrichment_dir = os.path.join(unisyn_spade_directory, 'enrichment_files')
    # isdir (rather than exists) so a stray regular file with this name is
    # reported as missing instead of crashing in os.listdir below.
    if not os.path.isdir(enrichment_dir):
        print('Could not find enrichment_files directory from {}, skipping lexical enrichment.'.format(
            unisyn_spade_directory))
        return

    independent_property = 'UnisynPrimStressedVowel1'.lower()
    dialect_property = 'UnisynPrimStressedVowel2_{}'.format(dialect_code).lower()

    with CorpusContext(config) as g:
        for lf in os.listdir(enrichment_dir):
            # The hierarchy is re-checked on every iteration because a
            # previous enrichment in this loop may have changed it.
            if lf == 'rule_applications.csv':
                if g.hierarchy.has_type_property('word', independent_property):
                    print('Dialect independent enrichment already loaded, skipping.')
                    continue
            elif lf.startswith(dialect_code):
                if g.hierarchy.has_type_property('word', dialect_property):
                    print('Dialect specific enrichment already loaded, skipping.')
                    continue
            else:
                continue

            path = os.path.join(enrichment_dir, lf)
            begin = time.time()
            enrich_lexicon_from_csv(g, path)
            time_taken = time.time() - begin
            # Report the same measurement that is saved as the benchmark.
            print('Lexicon enrichment took: {}'.format(time_taken))
            save_performance_benchmark(config, 'lexicon_enrichment', time_taken)
Beispiel #4
0
 def run_query(self):
     """Enrich the corpus lexicon from the CSV file named in ``self.kwargs``.

     Reads ``config``, ``case_sensitive``, ``path``, ``stop_check`` and
     ``call_back`` from ``self.kwargs``.  Progress is reported through
     ``call_back``.  ``stop_check`` is consulted once, after enrichment has
     finished; if it returns a truthy value the lexicon is reset.

     Returns ``False`` when the run was stopped (and the lexicon reset),
     ``True`` otherwise.
     """
     config = self.kwargs['config']
     case_sensitive = self.kwargs['case_sensitive']
     path = self.kwargs['path']
     stop_check = self.kwargs['stop_check']
     call_back = self.kwargs['call_back']
     call_back('Enriching lexicon...')
     call_back(0, 0)
     with CorpusContext(config) as c:
         # Forward the caller's case-sensitivity choice; previously it was
         # read from kwargs but never used.
         # NOTE(review): assumes enrich_lexicon_from_csv accepts a
         # ``case_sensitive`` keyword -- confirm against the installed
         # PolyglotDB version.
         enrich_lexicon_from_csv(c, path, case_sensitive=case_sensitive)
         if stop_check():
             call_back('Resetting lexicon...')
             call_back(0, 0)
             c.reset_lexicon()
             return False
     return True
Beispiel #5
0
 def run_query(self):
     """Run lexicon enrichment from a CSV file and honour a stop request.

     Expected ``self.kwargs`` keys: ``config``, ``case_sensitive``, ``path``,
     ``stop_check`` and ``call_back``.

     Returns ``True`` when enrichment completed without a stop request, and
     ``False`` when ``stop_check()`` was truthy afterwards (in which case the
     lexicon is reset before returning).
     """
     config = self.kwargs['config']
     case_sensitive = self.kwargs['case_sensitive']
     path = self.kwargs['path']
     stop_check = self.kwargs['stop_check']
     call_back = self.kwargs['call_back']
     call_back('Enriching lexicon...')
     call_back(0, 0)
     with CorpusContext(config) as c:
         # ``case_sensitive`` used to be fetched and then dropped; pass it
         # on so the requested matching mode is applied.
         # NOTE(review): relies on enrich_lexicon_from_csv taking a
         # ``case_sensitive`` keyword argument -- verify.
         enrich_lexicon_from_csv(c, path, case_sensitive=case_sensitive)
         if stop_check():
             # Undo the enrichment when the user cancelled.
             call_back('Resetting lexicon...')
             call_back(0, 0)
             c.reset_lexicon()
             return False
     return True
 def run_query(self):
     """Enrich the lexicon from a CSV file and signal completion.

     Pulls ``config``, ``case_sensitive``, ``path``, ``stop_check`` and
     ``call_back`` out of ``self.kwargs``.  After the enrichment call,
     ``self.actionCompleted`` is emitted with ``'enriching lexicon'``; then
     ``stop_check`` is evaluated and, if truthy, the lexicon is reset.

     Returns ``False`` if the run was stopped, ``True`` otherwise.
     """
     print("in the lexical worker")
     config = self.kwargs['config']
     case_sensitive = self.kwargs['case_sensitive']
     path = self.kwargs['path']
     stop_check = self.kwargs['stop_check']
     call_back = self.kwargs['call_back']
     call_back('Enriching lexicon...')
     call_back(0, 0)
     with CorpusContext(config) as c:
         # Pass the case-sensitivity option through instead of ignoring it.
         # NOTE(review): assumes enrich_lexicon_from_csv supports a
         # ``case_sensitive`` keyword -- confirm against PolyglotDB.
         enrich_lexicon_from_csv(c, path, case_sensitive=case_sensitive)
         self.actionCompleted.emit('enriching lexicon')
         if stop_check():
             call_back('Resetting lexicon...')
             call_back(0, 0)
             c.reset_lexicon()

             return False
     return True
Beispiel #7
0
    def run_query(self):
        """Enrich the corpus lexicon from a CSV file.

        Inputs are taken from ``self.kwargs``: ``config`` (passed to
        ``CorpusContext``), ``case_sensitive``, ``path`` (the CSV file),
        ``stop_check`` (callable polled once after enrichment) and
        ``call_back`` (progress reporter).

        ``self.actionCompleted`` is emitted right after the enrichment call,
        before the stop check.

        Returns
        -------
        bool
            ``False`` if ``stop_check()`` was truthy and the lexicon was
            reset, ``True`` otherwise.
        """
        print("in the lexical worker")
        config = self.kwargs['config']
        case_sensitive = self.kwargs['case_sensitive']
        path = self.kwargs['path']
        stop_check = self.kwargs['stop_check']
        call_back = self.kwargs['call_back']
        call_back('Enriching lexicon...')
        call_back(0, 0)
        with CorpusContext(config) as c:
            # The option was previously read but never forwarded, so the
            # caller's setting had no effect.
            # NOTE(review): assumes the enrichment function accepts a
            # ``case_sensitive`` keyword -- confirm against PolyglotDB.
            enrich_lexicon_from_csv(c, path, case_sensitive=case_sensitive)
            self.actionCompleted.emit('enriching lexicon')
            if stop_check():
                call_back('Resetting lexicon...')
                call_back(0, 0)
                c.reset_lexicon()

                return False
        return True
Beispiel #8
0
def duration_export(config,
                    corpus_name,
                    corpus_directory,
                    dialect_code,
                    speakers,
                    vowels,
                    stressed_vowels=None,
                    baseline=False,
                    ignored_speakers=None):
    """Query vowel tokens and write their durations and context to a CSV file.

    The query selects phones whose following phone is one of a fixed set of
    consonant labels and ends at both the syllable end and the utterance end.
    When ``stressed_vowels`` is given, phones are matched against it and the
    word must have exactly one syllable; otherwise phones are matched against
    ``vowels`` and both the word stress pattern and the syllable stress must
    be ``"1"``.

    The result is written to ``<base_dir>/<corpus_name>/<corpus_name>_duration.csv``
    (``base_dir`` is a name from the surrounding module) and the elapsed time
    is recorded with ``common.save_performance_benchmark``.

    Parameters
    ----------
    config
        Corpus configuration handed to ``CorpusContext``.
    corpus_name : str
        Corpus name; ``'spade-Buckeye'`` triggers the vowel-obstruent lexicon
        enrichment if it has not been loaded yet.
    corpus_directory : str
        Corpus root, used only to locate the Buckeye enrichment CSV.
    dialect_code
        Not used by this function; kept for interface compatibility.
    speakers
        If truthy, restrict results to these speaker names.
    vowels
        Vowel labels used when ``stressed_vowels`` is empty or ``None``.
    stressed_vowels
        Optional labels that already encode stress.
    baseline : bool
        If true, encode (when needed) and export the word baseline duration.
    ignored_speakers
        If truthy, exclude these speaker names.
    """
    csv_path = os.path.join(base_dir, corpus_name,
                            '{}_duration.csv'.format(corpus_name))

    with CorpusContext(config) as c:

        if corpus_name == 'spade-Buckeye':
            print("Processing {}".format(corpus_name))
            # Use the lower-cased property name, matching the checks further
            # down; the mixed-case spelling did not agree with them, so the
            # enrichment would be re-run on every export.
            if not c.hierarchy.has_type_property(
                    'word', 'ContainsVowelObstruent'.lower()):
                print('Classifying Buckeye vowel-obstruent pairs')
                enrich_lexicon_from_csv(
                    c,
                    os.path.join(
                        corpus_directory,
                        "corpus-data/enrichment/buckeye_obstruents.csv"))

        print("Beginning duration export")
        beg = time.time()

        consonants = [
            'p', 'P', 't', 'T', 'k', 'K', 'b', 'B', 'd', 'D', 'g', 'G', 'F',
            'f', 'V', 'v', 'N', 'n', 'm', 'M', 'NG', 'TH', 'DH', 'l', 'L',
            'ZH', 'x', 'X', 'r', 'R', 's', 'S', 'sh', 'SH', 'z', 'Z', 'zh',
            'ZH', 'J', 'C', 'tS', 'dZ', 'tq'
        ]

        # Filters shared by both modes: target vowel, then a following
        # consonant that closes the syllable and the utterance.
        target_labels = stressed_vowels if stressed_vowels else vowels
        q = c.query_graph(c.phone).filter(c.phone.label.in_(target_labels))
        q = q.filter(c.phone.following.end == c.phone.syllable.end)
        q = q.filter(
            c.phone.following.end == c.phone.syllable.word.utterance.end)
        q = q.filter(c.phone.following.label.in_(consonants))
        if stressed_vowels:
            q = q.filter(c.phone.syllable.word.num_syllables == 1)
        else:
            q = q.filter(c.phone.word.stresspattern == "1")
            q = q.filter(c.phone.syllable.stress == "1")

        print(c.hierarchy)
        has_vowel_obstruent = c.hierarchy.has_type_property(
            'word', 'containsvowelobstruent')
        if has_vowel_obstruent:
            # ``== True`` builds a query clause; it must not be rewritten as
            # a plain truthiness test.
            q = q.filter(c.phone.word.containsvowelobstruent == True)  # noqa: E712

        if speakers:
            q = q.filter(c.phone.speaker.name.in_(speakers))

        if ignored_speakers:
            q = q.filter(c.phone.speaker.name.not_in_(ignored_speakers))

        print("Applied filters")
        q = q.columns(
            c.phone.label.column_name('phone_label'),
            c.phone.begin.column_name('phone_begin'),
            c.phone.end.column_name('phone_end'),
            c.phone.duration.column_name('phone_duration'),
            c.phone.previous.label.column_name('previous_phone'),
            c.phone.following.label.column_name('following_phone'),
            c.phone.following.duration.column_name('following_duration'),
            c.phone.word.unisynprimstressedvowel1.column_name('word_unisyn'),
            c.phone.word.label.column_name('word_label'),
            c.phone.word.begin.column_name('word_begin'),
            c.phone.word.end.column_name('word_end'),
            c.phone.word.duration.column_name('word_duration'),
            c.phone.syllable.label.column_name('syllable_label'),
            c.phone.syllable.duration.column_name('syllable_duration'),
            c.phone.word.stresspattern.column_name('word_stresspattern'),
            c.phone.syllable.stress.column_name('syllable_stress'),
            c.phone.utterance.speech_rate.column_name('speech_rate'),
            c.phone.utterance.id.column_name('utterance_label'),
            c.phone.speaker.name.column_name('speaker_name'),
            c.phone.syllable.end.column_name('syllable_end'),
            c.phone.utterance.end.column_name('utterance_end'))

        # Export every speaker property; 'name' is already a column above.
        for sp, _ in c.hierarchy.speaker_properties:
            if sp == 'name':
                continue
            q = q.columns(getattr(c.phone.speaker, sp).column_name(sp))

        if c.hierarchy.has_token_property('word', 'surface_transcription'):
            print('getting underlying and surface transcriptions')
            q = q.columns(
                c.phone.word.transcription.column_name(
                    'word_underlying_transcription'),
                c.phone.word.surface_transcription.column_name(
                    'word_surface_transcription'))

        if has_vowel_obstruent:
            q = q.columns(
                c.phone.word.containsvowelobstruent.column_name(
                    'word_containsvowelobstruent'))

        # get baseline duration:
        # for most corpora this should be done over words
        # as buckeye has many-to-one correspondence between transcriptions and words
        # buckeye should have duration calculated over its underlying transcription
        if baseline:
            # NOTE(review): the guard checks a property called 'baseline'
            # while the exported attribute is 'baseline_duration' -- confirm
            # which name encode_baseline actually registers.
            if not c.hierarchy.has_type_property('word', 'baseline'):
                print('getting baseline from word')
                c.encode_baseline('word', 'duration')
            # Request the column whenever a baseline was asked for, not only
            # on the run that had to encode it.
            q = q.columns(
                c.phone.word.baseline_duration.column_name(
                    'word_baseline_duration'))

        print("Writing CSV")
        q.to_csv(csv_path)
        end = time.time()
        # Use one measurement for both the printed and the saved duration.
        time_taken = end - beg
        print('Query took: {}'.format(time_taken))
        print("Results for query written to " + csv_path)
        common.save_performance_benchmark(config, 'duration_export',
                                          time_taken)