def test_word_transcription(graph_db, textgrid_test_dir):
    """Loading the acoustic corpus TextGrid should give words a 'transcription' type property."""
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext("discourse_textgrid", **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        ctx.load(tg_parser, tg_path)
        assert ctx.hierarchy.has_type_property('word', 'transcription')
def test_load_pronunciation(textgrid_test_dir, graph_db):
    """Smoke test: the pronunciation-variants TextGrid loads without raising."""
    tg_path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
def run_query(self):
    """Import the corpus named in self.kwargs, emitting actionCompleted when done.

    Picks a parser from the corpus name (buckeye/timit/partitur) or, failing
    that, from the guessed TextGrid format, then resets and loads the corpus.
    Returns whatever CorpusContext.load reports as unparseable files.
    """
    time.sleep(0.1)
    corpus_name = self.kwargs['name']
    corpus_dir = self.kwargs['directory']
    reset = True
    config = CorpusConfig(corpus_name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as ctx:
        # Known corpora get dedicated parsers; anything else is a TextGrid corpus.
        if corpus_name == 'buckeye':
            parser = inspect_buckeye(corpus_dir)
        elif corpus_name == 'timit':
            parser = inspect_timit(corpus_dir)
        elif corpus_name == 'partitur':
            parser = inspect_partitur(corpus_dir)
        else:
            tg_format = guess_textgrid_format(corpus_dir)
            if tg_format == 'labbcat':
                parser = inspect_labbcat(corpus_dir)
            elif tg_format == 'mfa':
                parser = inspect_mfa(corpus_dir)
            elif tg_format == 'fave':
                parser = inspect_fave(corpus_dir)
            else:
                parser = inspect_textgrid(corpus_dir)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        # NOTE(review): nesting reconstructed from collapsed source — the load
        # is assumed to run regardless of the reset flag; confirm against VCS.
        if reset:
            ctx.reset(call_back=self.kwargs['call_back'],
                      stop_check=self.kwargs['stop_check'])
        could_not_parse = ctx.load(parser, corpus_dir)
        self.actionCompleted.emit('importing corpus')
        return could_not_parse
def test_directory(textgrid_test_dir, graph_db):
    """Loading a whole directory with a single-file parser leaves some files unparsed."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid_directory', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        unparsed = ctx.load(tg_parser, textgrid_test_dir)
        assert len(unparsed) > 0
def run_query(self):
    """Import the corpus named in self.kwargs and return unparseable files.

    Chooses a parser from the corpus name (buckeye/timit) or from the guessed
    TextGrid format, then resets and loads the corpus.
    """
    time.sleep(0.1)
    corpus_name = self.kwargs['name']
    corpus_dir = self.kwargs['directory']
    reset = True
    config = CorpusConfig(corpus_name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as ctx:
        # Known corpora get dedicated parsers; anything else is a TextGrid corpus.
        if corpus_name == 'buckeye':
            parser = inspect_buckeye(corpus_dir)
        elif corpus_name == 'timit':
            parser = inspect_timit(corpus_dir)
        else:
            tg_format = guess_textgrid_format(corpus_dir)
            if tg_format == 'labbcat':
                parser = inspect_labbcat(corpus_dir)
            elif tg_format == 'mfa':
                parser = inspect_mfa(corpus_dir)
            elif tg_format == 'fave':
                parser = inspect_fave(corpus_dir)
            else:
                parser = inspect_textgrid(corpus_dir)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        # NOTE(review): nesting reconstructed from collapsed source — the load
        # is assumed to run regardless of the reset flag; confirm against VCS.
        if reset:
            ctx.reset(call_back=self.kwargs['call_back'],
                      stop_check=self.kwargs['stop_check'])
        could_not_parse = ctx.load(parser, corpus_dir)
        return could_not_parse
def test_directory(textgrid_test_dir, graph_db):
    """Directory load with a single-file parser reports unparsed files."""
    single_file = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid_directory', **graph_db) as corpus:
        corpus.reset()
        unparsed = corpus.load(inspect_textgrid(single_file), textgrid_test_dir)
        assert len(unparsed) > 0
def acoustic_config(graph_db, textgrid_test_dir):
    """Fixture: load the acoustic corpus and return its CorpusConfig."""
    config = CorpusConfig("acoustic", **graph_db)
    tg_path = os.path.join(textgrid_test_dir, "acoustic_corpus.TextGrid")
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
    return config
def acoustic_config(graph_db, textgrid_test_dir):
    """Fixture: reset, load the acoustic corpus, and return its CorpusConfig."""
    config = CorpusConfig('acoustic', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
    return config
def summarized_config(graph_db, textgrid_test_dir):
    """Fixture: load the acoustic corpus under the 'summarized' name and return its config."""
    config = CorpusConfig('summarized', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
    return config
def test_load(textgrid_test_dir, graph_db):
    """Load phone_word.TextGrid after rewiring tier types and the hierarchy."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid', **graph_db) as ctx:
        tg_parser = inspect_textgrid(tg_path)
        # Reassign tier roles: tier 1 becomes the word tier, tier 2 is skipped.
        tg_parser.annotation_types[1].linguistic_type = 'word'
        tg_parser.annotation_types[2].ignored = True
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        print([(t.linguistic_type, t.name) for t in tg_parser.annotation_types])
        ctx.load(tg_parser, tg_path)
def acoustic_config(graph_db, textgrid_test_dir):
    """Fixture: load the acoustic corpus and select acousticsim pitch/formant sources."""
    config = CorpusConfig('acoustic', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
    config.pitch_algorithm = 'acousticsim'
    config.formant_source = 'acousticsim'
    return config
def acoustic_config(graph_db, textgrid_test_dir):
    """Fixture: load the acoustic corpus and select acousticsim pitch/formant algorithms."""
    config = CorpusConfig('acoustic', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
    config.pitch_algorithm = 'acousticsim'
    config.formant_algorithm = 'acousticsim'
    return config
def test_load(textgrid_test_dir, graph_db):
    """Reset, rewire tier roles and hierarchy, then load phone_word.TextGrid."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        # Tier 1 becomes the word tier; tier 2 is excluded from the import.
        tg_parser.annotation_tiers[1].linguistic_type = 'word'
        tg_parser.annotation_tiers[2].ignored = True
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        print([(t.linguistic_type, t.name) for t in tg_parser.annotation_tiers])
        ctx.load(tg_parser, tg_path)
def test_load_pronunciation_ignore(textgrid_test_dir, graph_db):
    """Ignoring the pronunciation tiers must make actualPron queries fail."""
    tg_path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_tiers[1].ignored = True
        tg_parser.annotation_tiers[2].ignored = True
        ctx.load(tg_parser, tg_path)
        # The attribute was never imported, so querying it raises.
        with pytest.raises(GraphQueryError):
            ctx.query_graph(ctx.actualPron).all()
def test_load_pronunciation_ignore(textgrid_test_dir, graph_db):
    """Ignoring the pronunciation tiers must make actualPron queries fail."""
    tg_path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_types[1].ignored = True
        tg_parser.annotation_types[2].ignored = True
        ctx.load(tg_parser, tg_path)
        # The attribute was never imported, so querying it raises.
        with pytest.raises(GraphQueryError):
            ctx.query_graph(ctx.actualPron).all()
def french_config(graph_db, textgrid_test_dir):
    """Fixture: load the French TextGrid, encode pauses and utterances, return its config."""
    config = CorpusConfig('french', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'FR001_5.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
        ctx.encode_pauses(['sil', '<SIL>'])
        ctx.encode_utterances(min_pause_length=.15)
    return config
def test_utterance_nosilence(graph_db, textgrid_test_dir):
    """Utterance encoding on a corpus with no silence: boundaries and neighbors."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word_no_silence.TextGrid')
    with CorpusContext('word_phone_nosilence', **graph_db) as g:
        g.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_types[0].linguistic_type = 'phone'
        tg_parser.annotation_types[1].linguistic_type = 'word'
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        g.load(tg_parser, tg_path)
        g.encode_utterances()

        # The final word 'b' has no follower.
        q = g.query_graph(g.word).filter(g.word.label == 'b')
        q = q.columns(g.word.following.label.column_name('following_word'))
        print(q.cypher())
        results = q.all()
        assert len(results) == 1
        assert results[0]['following_word'] is None

        # Exactly one word starts at the utterance boundary, and it is 'a'.
        q = g.query_graph(g.word).filter(g.word.begin == g.word.utterance.begin)
        results = q.all()
        assert len(results) == 1
        assert results[0]['label'] == 'a'

        # Same for the phone tier.
        q = g.query_graph(g.phone).filter(g.phone.begin == g.phone.utterance.begin)
        results = q.all()
        assert len(results) == 1
        assert results[0]['label'] == 'a'

        # Things like g.phone.word.following are currently broken in PolyglotDB,
        # so the remainder is intentionally unreachable until that is fixed.
        return
        q = g.query_graph(g.phone).filter(g.phone.label == 'b')
        q = q.filter(g.phone.following.label == 'b')
        q = q.columns(g.phone.label, g.phone.id,
                      g.phone.word.following.label.column_name('following_word'))
        print(q.cypher())
        results = q.all()
        assert len(results) == 1
        assert results[0]['following_word'] is None
def acoustic_utt_config(graph_db, textgrid_test_dir):
    """Fixture: acoustic corpus with pauses/utterances encoded and acousticsim algorithms."""
    config = CorpusConfig('acoustic utt', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
        ctx.encode_pauses(['sil'])
        ctx.encode_utterances(min_pause_length=0)
    config.pitch_algorithm = 'acousticsim'
    config.formant_algorithm = 'acousticsim'
    return config
def acoustic_utt_config(graph_db, textgrid_test_dir):
    """Fixture: acoustic corpus with pauses/utterances encoded; acousticsim pitch/formants."""
    config = CorpusConfig('acoustic utt', **graph_db)
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as ctx:
        ctx.reset()
        ctx.load(inspect_textgrid(tg_path), tg_path)
        ctx.encode_pauses(['sil'])
        ctx.encode_utterances(min_pause_length=0)
    config.pitch_algorithm = 'acousticsim'
    config.formant_algorithm = 'acousticsim'
    return config
def test_utterance_nosilence(graph_db, textgrid_test_dir):
    """Utterance encoding when the TextGrid contains no silence intervals."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word_no_silence.TextGrid')
    with CorpusContext('word_phone_nosilence', **graph_db) as g:
        g.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_types[0].linguistic_type = 'phone'
        tg_parser.annotation_types[1].linguistic_type = 'word'
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        g.load(tg_parser, tg_path)
        g.encode_utterances()

        # Last word 'b' has no following word.
        q = g.query_graph(g.word).filter(g.word.label == 'b')
        q = q.columns(g.word.following.label.column_name('following_word'))
        print(q.cypher())
        results = q.all()
        assert len(results) == 1
        assert results[0]['following_word'] is None

        # The word at the utterance start is 'a'.
        q = g.query_graph(g.word).filter(g.word.begin == g.word.utterance.begin)
        results = q.all()
        assert len(results) == 1
        assert results[0]['label'] == 'a'

        # The phone at the utterance start is also 'a'.
        q = g.query_graph(g.phone).filter(g.phone.begin == g.phone.utterance.begin)
        results = q.all()
        assert len(results) == 1
        assert results[0]['label'] == 'a'

        # Things like g.phone.word.following are currently broken in PolyglotDB,
        # so the remainder is intentionally unreachable.
        return
        q = g.query_graph(g.phone).filter(g.phone.label == 'b')
        q = q.filter(g.phone.following.label == 'b')
        q = q.columns(g.phone.label, g.phone.id,
                      g.phone.word.following.label.column_name('following_word'))
        print(q.cypher())
        results = q.all()
        assert len(results) == 1
        assert results[0]['following_word'] is None
def test_load_pronunciation(textgrid_test_dir, graph_db):
    """Pronunciation tiers load as token properties when type_property is off."""
    tg_path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        # Store actual pronunciation per token, not on the word type.
        tg_parser.annotation_types[2].type_property = False
        ctx.load(tg_parser, tg_path)
        q = (ctx.query_graph(ctx.words)
             .filter(ctx.words.label == 'probably')
             .order_by(ctx.words.begin)
             .columns(ctx.words.label,
                      ctx.words.dictionaryPron.column_name('dict_pron'),
                      ctx.words.actualPron.column_name('act_pron')))
        results = q.all()
        assert results[0]['dict_pron'] == 'p.r.aa.b.ah.b.l.iy'
        assert results[0]['act_pron'] == 'p.r.aa.b.ah.b.l.iy'
def test_load_pronunciation(textgrid_test_dir, graph_db):
    """Pronunciation tiers load as token properties when type_property is off."""
    tg_path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as ctx:
        ctx.reset()
        tg_parser = inspect_textgrid(tg_path)
        # Store actual pronunciation per token, not on the word type.
        tg_parser.annotation_tiers[2].type_property = False
        ctx.load(tg_parser, tg_path)
        q = (ctx.query_graph(ctx.word)
             .filter(ctx.word.label == 'probably')
             .order_by(ctx.word.begin)
             .columns(ctx.word.label,
                      ctx.word.dictionaryPron.column_name('dict_pron'),
                      ctx.word.actualPron.column_name('act_pron')))
        results = q.all()
        assert results[0]['dict_pron'] == 'p.r.aa.b.ah.b.l.iy'
        assert results[0]['act_pron'] == 'p.r.aa.b.ah.b.l.iy'
def test_utterance_oneword(graph_db, textgrid_test_dir):
    """A single-word, no-silence corpus yields an utterance starting at time 0."""
    tg_path = os.path.join(textgrid_test_dir, 'one_word_no_silence.TextGrid')
    with CorpusContext('one_word_no_silence', **graph_db) as g:
        g.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_types[0].linguistic_type = 'phone'
        tg_parser.annotation_types[1].linguistic_type = 'word'
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        g.load(tg_parser, tg_path)
        g.encode_utterances()
        utterances = g.query_graph(g.utterance).all()
        assert utterances[0].begin == 0
def test_utterance_oneword(graph_db, textgrid_test_dir):
    """A single-word, no-silence corpus yields an utterance starting at time 0."""
    tg_path = os.path.join(textgrid_test_dir, 'one_word_no_silence.TextGrid')
    with CorpusContext('one_word_no_silence', **graph_db) as g:
        g.reset()
        tg_parser = inspect_textgrid(tg_path)
        tg_parser.annotation_types[0].linguistic_type = 'phone'
        tg_parser.annotation_types[1].linguistic_type = 'word'
        tg_parser.hierarchy['word'] = None
        tg_parser.hierarchy['phone'] = 'word'
        g.load(tg_parser, tg_path)
        g.encode_utterances()
        utterances = g.query_graph(g.utterance).all()
        assert utterances[0].begin == 0
def test_load_discourse(graph_db, mfa_test_dir, textgrid_test_dir):
    """Two discourses (MFA + plain TextGrid) coexist in one corpus.

    After loading both and encoding syllables, token and lexicon queries must
    find material from each discourse, and the sound file for the plain
    TextGrid discourse must be registered on disk.
    """
    mfa_path = os.path.join(mfa_test_dir, "mfa_test.TextGrid")
    tg_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    mfa_parser = inspect_mfa(mfa_path)
    tg_parser = inspect_textgrid(tg_path)
    with CorpusContext('load_remove_test', **graph_db) as ctx:
        ctx.reset()
        ctx.load_discourse(tg_parser, tg_path)
        ctx.load_discourse(mfa_parser, mfa_path)

        # Syllabics from both phone inventories (upper- and lower-case labels).
        syllabics = ['ER', 'AE', 'IH', 'EH', 'ae', 'ih', 'er', 'eh']
        ctx.encode_syllabic_segments(syllabics)
        ctx.encode_syllables()

        # Token-level hits from the MFA discourse.
        assert ctx.query_graph(ctx.word).filter(ctx.word.label == 'JURASSIC').count() > 0
        assert ctx.query_graph(ctx.phone).filter(ctx.phone.label == 'AE').count() > 0
        assert ctx.query_lexicon(ctx.syllable).filter(ctx.syllable.label == 'JH.ER').count() > 0

        # Lexicon-level entries from both discourses.
        assert ctx.query_lexicon(ctx.lexicon_word).filter(
            ctx.lexicon_word.label == 'JURASSIC').count() > 0
        assert ctx.query_lexicon(ctx.lexicon_phone).filter(
            ctx.lexicon_phone.label == 'AE').count() > 0
        assert ctx.query_lexicon(ctx.lexicon_phone).filter(
            ctx.lexicon_phone.label == 'ae').count() > 0
        assert ctx.query_lexicon(ctx.lexicon_syllable).filter(
            ctx.lexicon_syllable.label == 'JH.ER').count() > 0

        # Discourse and speaker metadata were registered.
        assert ctx.query_discourses().filter(ctx.discourse.name == 'mfa_test').count() > 0
        assert ctx.query_speakers().filter(ctx.speaker.name == 'mfa').count() > 0

        # The consonant-band file for the plain TextGrid discourse exists.
        sound_info = ctx.discourse_sound_file('acoustic_corpus')
        assert os.path.exists(sound_info['consonant_file_path'])
def test_inspect_textgrid_directory(textgrid_test_dir):
    """Inspecting the TextGrid directory discovers four annotation tiers."""
    tg_parser = inspect_textgrid(textgrid_test_dir)
    assert len(tg_parser.annotation_tiers) == 4
'user': '******', 'password': '******' } def call_back(*args): args = [x for x in args if isinstance(x, str)] if args: print(' '.join(args)) reset = True if reset: print("Getting annotation types..") parser = pgio.inspect_textgrid(path_to_gp) parser.speaker_parser = FilenameSpeakerParser(5) parser.call_back = print print('Loading corpus...') with CorpusContext('gp_thai', **graph_db) as c: c.reset() beg = time.time() c.load(parser, path_to_gp) end = time.time() print('Time taken: {}'.format(end - beg)) if __name__ == '__main__': with CorpusContext('gp_thai', **graph_db) as g: q = g.query_graph(g.phones).filter(g.phones.label == 'd') print(q.cypher()) print(q.count())
def test_tobi(textgrid_test_dir):
    """A ToBI TextGrid is inspected into a TobiTier followed by an OrthographyTier."""
    tg_path = os.path.join(textgrid_test_dir, 'tobi.TextGrid')
    tg_parser = inspect_textgrid(tg_path)
    assert isinstance(tg_parser.annotation_tiers[0], TobiTier)
    assert isinstance(tg_parser.annotation_tiers[1], OrthographyTier)
def test_tobi(textgrid_test_dir):
    """A ToBI TextGrid is inspected into a TobiTier followed by an OrthographyTier."""
    tg_path = os.path.join(textgrid_test_dir, 'tobi.TextGrid')
    tg_parser = inspect_textgrid(tg_path)
    assert isinstance(tg_parser.annotation_types[0], TobiTier)
    assert isinstance(tg_parser.annotation_types[1], OrthographyTier)
def test_directory(textgrid_test_dir, graph_db):
    """Loading a directory with a parser inspected from a single file raises TextGridError."""
    tg_path = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid_directory', **graph_db) as ctx:
        with pytest.raises(TextGridError):
            ctx.load(inspect_textgrid(tg_path), textgrid_test_dir)
def test_inspect_textgrid_directory(textgrid_test_dir):
    """Inspecting the TextGrid directory discovers four annotation types."""
    tg_parser = inspect_textgrid(textgrid_test_dir)
    assert len(tg_parser.annotation_types) == 4
# Connection settings for the local graph database (credentials redacted).
graph_db = {
    'host': 'localhost',
    'port': 7474,
    'user': '******',
    'password': '******',
}


def call_back(*args):
    """Print any string arguments, space-joined; ignore non-strings."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


reset = True

# NOTE(review): nesting reconstructed from collapsed source — the load is
# assumed to sit under the reset guard; confirm against version control.
if reset:
    print("Getting annotation types..")
    parser = pgio.inspect_textgrid(path_to_gp)
    parser.speaker_parser = FilenameSpeakerParser(5)
    parser.call_back = print

    print('Loading corpus...')
    with CorpusContext('gp_thai', **graph_db) as c:
        c.reset()
        beg = time.time()
        c.load(parser, path_to_gp)
        end = time.time()
        print('Time taken: {}'.format(end - beg))

if __name__ == '__main__':
    with CorpusContext('gp_thai', **graph_db) as g:
        q = g.query_graph(g.phones).filter(g.phones.label == 'd')
        print(q.cypher())