def test_word_mean_duration(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("mean duration (word):")
        res = g.get_measure('duration', 'mean', 'word')
        print(res)
        assert (len(res) == 44)
        for i, r in enumerate(res):
            if r[0] == 'words':
                break
        assert res[i][1] == approx(0.5340040000000001, 1e-3)

def test_ilg_mismatched(graph_db, ilg_test_dir):
    mismatched_path = os.path.join(ilg_test_dir, 'mismatched.txt')
    basic_path = os.path.join(ilg_test_dir, 'basic.txt')
    parser = inspect_ilg(basic_path)
    with CorpusContext('mismatch', **graph_db) as c:
        c.reset()
        with pytest.raises(ILGWordMismatchError):
            c.load(parser, mismatched_path)

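# A hedged illustration (not the actual fixture file): in an interlinear gloss
# corpus, each word line and its transcription line must contain the same
# number of tokens. Content along these lines would raise ILGWordMismatchError:
#
#   a b c
#   a.b c.d          <- two transcriptions for three words
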
def test_strings(timed_config):
    with CorpusContext(timed_config) as g:
        q = g.query_graph(g.word).filter(g.word.label == 'are')
        q = q.columns(g.word.label.column_name('label'),
                      g.word.phone.label.column_name('phones'))
        print(q.cypher())
        results = q.all()
        assert (all(x['label'] == 'are' for x in results))
        assert (all(x['phones'] == ['aa', 'r'] for x in results))

def run_query(self):
    config = self.kwargs['config']
    acoustics = self.kwargs['acoustics']
    with CorpusContext(config) as c:
        acoustic_analysis(c,
                          stop_check=self.kwargs['stop_check'],
                          call_back=self.kwargs['call_back'],
                          acoustics=acoustics)
    self.actionCompleted.emit('analysing acoustics')
    return True

def test_baseline_syllable(acoustic_config):
    syllabics = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy', 'uh',
                 'ah', 'ao', 'er', 'ow']
    with CorpusContext(acoustic_config) as g:
        g.encode_syllabic_segments(syllabics)
        g.encode_syllables()
        res = g.get_measure('duration', 'baseline', 'syllable')
        print(res)

def test_phone_mean_duration_speaker(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("phone mean:")
        res = g.get_measure('duration', 'mean', 'phone', False, 'unknown')
        print(res)
        assert (len(res) == 33)
        for i, r in enumerate(res):
            if r[0] == 'uw':
                break
        assert res[i][1] == approx(0.08043999999999973, 1e-3)

def test_baseline_word(summarized_config):
    with CorpusContext(summarized_config) as g:
        g.reset_pauses()
        g.reset_syllables()
        g.reset_utterances()
        res = g.get_measure('duration', 'baseline', 'word')
        print(res)
        assert res['this'] == approx(0.20937191666666685, 1e-3)
        assert (len(res) == 44)

def test_word_mean_duration_with_speaker_buckeye(graph_db, buckeye_test_dir):
    with CorpusContext('directory_buckeye', **graph_db) as g:
        g.encode_utterances()
        res = g.get_measure('duration', 'mean', 'word', True)
        print(res)
        for i, r in enumerate(res):
            if r[1] == "that's":
                break
        assert (len(res) == 9)
        assert res[i][2] == approx(0.17431200000000002, 1e-3)

@pytest.fixture  # decorator assumed: this config is consumed as a fixture by the tests above
def acoustic_config(graph_db, textgrid_test_dir):
    config = CorpusConfig('acoustic', **graph_db)
    acoustic_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    with CorpusContext(config) as c:
        c.reset()
        parser = inspect_textgrid(acoustic_path)
        c.load(parser, acoustic_path)
        # c.analyze_acoustics()
    return config

def test_relativize_intensity(acoustic_utt_config):
    with CorpusContext(acoustic_utt_config) as g:
        # By-speaker mean and standard deviation used for relativization
        # (variable names kept from the source).
        mean_f0 = 97.72
        sd_f0 = 1.88997
        expected_intensity = {
            Decimal('4.23'): {'Intensity': 98,
                              'Intensity_relativized': (98 - mean_f0) / sd_f0},
            Decimal('4.24'): {'Intensity': 100,
                              'Intensity_relativized': (100 - mean_f0) / sd_f0},
            Decimal('4.25'): {'Intensity': 99,
                              'Intensity_relativized': (99 - mean_f0) / sd_f0},
            Decimal('4.26'): {'Intensity': 95.8,
                              'Intensity_relativized': (95.8 - mean_f0) / sd_f0},
            Decimal('4.27'): {'Intensity': 95.8,
                              'Intensity_relativized': (95.8 - mean_f0) / sd_f0},
        }
        g.relativize_acoustic_measure('intensity', by_speaker=True)
        q = g.query_graph(g.phone)
        q = q.filter(g.phone.label == 'ow')
        q = q.order_by(g.phone.begin.column_name('begin'))
        ac = g.phone.intensity
        q = q.columns(g.phone.label, ac.track)
        results = q.all()
        assert (len(results[0].track) == len(expected_intensity.items()))
        print(sorted(expected_intensity.items()))
        print(results[0].track)
        for point in results[0].track:
            print(point)
            assert (round(point['Intensity_relativized'], 5) ==
                    round(expected_intensity[point.time]['Intensity_relativized'], 5))

        g.reset_relativized_acoustic_measure('intensity')
        assert g.hierarchy.acoustic_properties['intensity'] == {('Intensity', float)}
        q = g.query_graph(g.phone)
        q = q.filter(g.phone.label == 'ow')
        q = q.order_by(g.phone.begin.column_name('begin'))
        ac = g.phone.intensity
        q = q.columns(g.phone.label, ac.track)
        results = q.all()
        assert len(results[0].track) == 5
        for r in results:
            for p in r.track:
                assert not p.has_value('Intensity_relativized')

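# A minimal sketch (not part of the test suite) of the by-speaker z-score
# relativization that the expected values above encode: each sample is shifted
# by the speaker's mean and scaled by the speaker's standard deviation.
def relativize(value, speaker_mean, speaker_sd):
    """Return the z-score, e.g. relativize(98, 97.72, 1.88997) ~= 0.148."""
    return (value - speaker_mean) / speaker_sd
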
def test_position_query(timed_config):
    with CorpusContext(timed_config) as g:
        q = g.query_graph(g.phone).filter(g.phone.label == 'k')
        q = q.columns(g.phone.word.phone.position.column_name('position'))
        q = q.order_by(g.phone.word.begin)
        print(q.cypher())
        results = q.all()
        expected = [1, 1]
        assert (len(results) == len(expected))
        for i in range(len(expected)):
            assert (results[i]['position'] == expected[i])

def test_reset_utterances(acoustic_utt_config):
    with CorpusContext(acoustic_utt_config) as g:
        g.reset_utterances()
        g.encode_utterances(0.15)
        q = g.query_graph(g.phone).filter(g.phone.label == 'ow')
        q = q.columns(g.phone.begin, g.phone.end, g.phone.pitch.track)
        print(q.cypher())
        results = q.all()
        assert (len(results) > 0)
        for r in results:
            assert len(r.track)

def test_query_count_group_by(acoustic_config):
    with CorpusContext(acoustic_config) as g:
        q = g.query_graph(g.phone).filter(g.phone.label.in_(['aa', 'ae']))
        results = q.group_by(g.phone.label.column_name('label')).aggregate(Count())
        assert (len(results) == 2)
        print(results)
        assert (results[0]['label'] == 'aa')
        assert (results[0]['count_all'] == 3)
        assert (results[1]['label'] == 'ae')
        assert (results[1]['count_all'] == 7)

def updateConfig(self, config):
    self.config = config
    self.discourseList.clear()
    if self.config is None or self.config.corpus_name == '':
        return
    try:
        with CorpusContext(self.config) as c:
            for d in sorted(c.discourses):
                self.discourseList.addItem(d)
    except GraphQueryError:
        self.discourseList.clear()

def test_phone_std_dev(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("phone std dev:")
        res = g.get_measure('duration', 'stdev', 'phone')
        print(res)
        for i, r in enumerate(res):
            if r[0] == 'uw':
                break
        assert (len(res) == 33)
        assert (abs(res[i][1] - 0.026573072836990105) < .0000000000001)

def test_phone_mean_duration_with_speaker(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("phone mean by speaker:")
        # res = g.phone_mean_duration_with_speaker()
        res = g.get_measure('duration', 'mean', 'phone', True)
        print(res)
        assert (len(res) == 33)
        for i, r in enumerate(res):
            if r[1] == 'uw':
                break
        assert res[i][2] == approx(0.08043999999999973, 1e-3)

def test_word_std_dev(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("word std dev:")
        res = g.get_measure('duration', 'stdev', 'word')
        print(res)
        assert (len(res) == 44)
        for i, r in enumerate(res):
            if r[0] == 'words':
                break
        assert (abs(res[i][1] - 0.26996736762060747) < .0000000000001)

def test_phone_mean_duration_speaker(summarized_config):
    with CorpusContext(summarized_config) as g:
        print("phone mean:")
        g.encode_utterances()
        res = g.get_measure('duration', 'mean', 'phone', False, 'unknown')
        print(res)
        assert (len(res) == 33)
        for i, r in enumerate(res):
            if r[0] == 'uw':
                break
        assert (abs(res[i][1] - 0.08043999999999973) < .0000000000001)

def test_order_by(timed_config):
    with CorpusContext(timed_config) as g:
        q = g.query_graph(g.word).filter(g.word.label == 'are').order_by(
            g.word.begin.column_name('begin'))  # .times('begin', 'end')
        prev = 0
        print(q.cypher())
        print(q.all())
        for x in q.all():
            assert (x['begin'] > prev)
            prev = x['begin']
        assert ('timed' in get_corpora_list(timed_config))

def export(request, corpus):
    response = HttpResponse(content_type='text/csv')
    a_type = request.data['annotation_type']
    corpus = Corpus.objects.get(pk=corpus)
    ordering = request.data.get('ordering', '')
    filters = request.data['filters']
    columns = request.data['columns']
    response['Content-Disposition'] = \
        'attachment; filename="{}_query_export.csv"'.format(a_type)
    print(filters)
    print(columns)
    with CorpusContext(corpus.config) as c:
        a = getattr(c, a_type)
        q = c.query_graph(a)
        for k, v in filters.items():
            if v[0] == '':
                continue
            if v[0] == 'null':
                v = None
            else:
                try:
                    v = float(v[0])
                except ValueError:
                    v = v[0]
            # Double-underscore keys (e.g. 'word__label') walk down the
            # annotation hierarchy one attribute at a time.
            k = k.split('__')
            att = a
            for f in k:
                att = getattr(att, f)
            q = q.filter(att == v)
        if ordering:
            desc = False
            if ordering.startswith('-'):
                desc = True
                ordering = ordering[1:]
            ordering = ordering.split('.')
            att = a
            for o in ordering:
                att = getattr(att, o)
            q = q.order_by(att, desc)
        else:
            q = q.order_by(getattr(a, 'label'))
        columns_for_export = []
        for col in columns:  # renamed from 'c' to avoid shadowing the CorpusContext
            att = a
            for f in col.split('__'):
                att = getattr(att, f)
            columns_for_export.append(att)
        q = q.columns(*columns_for_export)
        writer = csv.writer(response)
        q.to_csv(writer)
    return response

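# A minimal sketch (hypothetical helper, not in the source) of the
# double-underscore traversal used twice in export() above: a key such as
# 'word__label' is resolved against the root annotation by chained getattr.
def resolve_attribute(annotation, key):
    att = annotation
    for part in key.split('__'):
        att = getattr(att, part)
    return att
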
def test_query_time(timed_config):
    with CorpusContext(timed_config) as g:
        q = g.query_graph(g.word).filter(g.word.label == 'are')
        q = q.filter(g.word.begin > 2)
        print(q.cypher())
        assert (len(list(q.all())) == 1)

        q = g.query_graph(g.word).filter(g.word.label == 'are')
        q = q.filter(g.word.begin < 2)
        print(q.cypher())
        assert (len(list(q.all())) == 1)

def test_export_spelling(graph_db, export_test_dir):
    export_path = os.path.join(export_test_dir, 'export_spelling.txt')
    with CorpusContext('spelling_no_ignore', **graph_db) as c:
        export_discourse_spelling(c, 'text_spelling', export_path,
                                  words_per_line=10)
    with open(export_path, 'r') as f:
        assert (f.read() == "ab cab'd ad ab ab.")

def test_query_duration_aggregate_average_group_by(acoustic_config):
    with CorpusContext(acoustic_config) as g:
        q = g.query_graph(g.phone).filter(g.phone.label.in_(['aa', 'ae']))
        results = q.group_by(g.phone.label.column_name('label')).aggregate(
            Average(g.phone.duration))
        assert (len(results) == 2)
        assert (results[0]['label'] == 'aa')
        assert (abs(results[0]['average_duration'] - 0.08) < 0.001)
        assert (results[1]['label'] == 'ae')
        assert (abs(results[1]['average_duration'] - 0.193) < 0.001)

def test_discourse_query(timed_config):
    with CorpusContext(timed_config) as g:
        q = g.query_graph(g.word).columns(
            g.word.discourse.name.column_name('discourse'))
        print(q.cypher())
        assert (all(x['discourse'] == 'test_timed' for x in q.all()))

        # Filter on the actual discourse name; filtering on 'test' would match
        # nothing and make the assertion below vacuously true.
        q = g.query_graph(g.word).filter(g.word.discourse.name == 'test_timed')
        q = q.columns(g.word.discourse.name.column_name('discourse'))
        print(q.cypher())
        assert (all(x['discourse'] == 'test_timed' for x in q.all()))

def reset(corpus_name):
    """Remove the database files produced from import."""
    with ensure_local_database_running(corpus_name, port=8080, ip=server_ip,
                                       token=load_token()) as params:
        config = CorpusConfig(corpus_name, **params)
        with CorpusContext(config) as c:
            print('Resetting the corpus.')
            c.reset()

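# Hypothetical usage (assuming server_ip and a saved token are configured at
# module level, as the call above implies):
#
#   reset('my_corpus')
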
def test_analyze_pitch_gendered_praat(acoustic_utt_config, praat_path):
    with CorpusContext(acoustic_utt_config) as g:
        g.reset_acoustics()
        g.config.praat_path = praat_path
        g.config.pitch_algorithm = 'speaker_adjusted'
        g.analyze_pitch(source='praat')
        assert (g.has_pitch('acoustic_corpus'))
        g.reset_pitch()
        assert not g.has_pitch(g.discourses[0])

def test_load(textgrid_test_dir, graph_db):
    path = os.path.join(textgrid_test_dir, 'phone_word.TextGrid')
    with CorpusContext('test_textgrid', **graph_db) as c:
        c.reset()
        parser = inspect_textgrid(path)
        parser.annotation_tiers[1].linguistic_type = 'word'
        parser.annotation_tiers[2].ignored = True
        parser.hierarchy['word'] = None
        parser.hierarchy['phone'] = 'word'
        print([(x.linguistic_type, x.name) for x in parser.annotation_tiers])
        c.load(parser, path)

def test_subset_enrichment(acoustic_config):
    syllabics = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy', 'uh',
                 'ah', 'ao', 'er', 'ow']
    phone_class = ['ae', 'aa', 'd', 'r']
    with CorpusContext(acoustic_config) as c:
        c.reset_class('syllabic')
        c.reset_class('test')
        c.encode_class(syllabics, "syllabic")
        c.encode_class(phone_class, "test")
        assert (len(c.hierarchy.subset_types['phone']) == 2)

def test_load_pronunciation_ignore(textgrid_test_dir, graph_db):
    path = os.path.join(textgrid_test_dir, 'pronunc_variants_corpus.TextGrid')
    with CorpusContext('test_pronunc', **graph_db) as c:
        c.reset()
        parser = inspect_textgrid(path)
        parser.annotation_tiers[1].ignored = True
        parser.annotation_tiers[2].ignored = True
        c.load(parser, path)
        with pytest.raises(GraphQueryError):
            q = c.query_graph(c.actualPron)
            results = q.all()

def test_load_transcription_morpheme(graph_db, text_transcription_test_dir):
    transcription_morphemes_path = os.path.join(
        text_transcription_test_dir,
        'text_transcription_morpheme_boundaries.txt')
    parser = inspect_transcription(transcription_morphemes_path)
    parser.annotation_types[0].morph_delimiters = set('-=')
    with CorpusContext('transcription_morpheme', **graph_db) as c:
        c.reset()
        c.load(parser, transcription_morphemes_path)
        # assert(c.lexicon['cab'].frequency == 2)
        assert (str(c.lexicon['cab'].transcription) == 'c.a.b')