def test_aclew(project):
    data = pd.read_csv("tests/data/aclew.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": set,
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
            for set in ["aclew_vtc", "aclew_alice", "aclew_vcm"]
        ]),
        import_function=partial(fake_vocs, data),
    )

    aclew = AclewMetrics(
        project,
        by="child_id",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        vtc="aclew_vtc",
        alice="aclew_alice",
        vcm="aclew_vcm",
    )
    aclew.extract()

    truth = pd.read_csv("tests/truth/aclew_metrics.csv")
    pd.testing.assert_frame_equal(aclew.metrics, truth, check_like=True)
def import_annotations(args):
    """convert and import a set of annotations"""
    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print(
            "validation failed, {} error(s) occurred".format(len(errors)),
            file=sys.stderr,
        )
        sys.exit(1)

    if args.annotations:
        annotations = pd.read_csv(args.annotations)
    else:
        annotations = pd.DataFrame([
            {
                col.name: getattr(args, col.name)
                for col in AnnotationManager.INDEX_COLUMNS
                if not col.generated
            }
        ])

    am = AnnotationManager(project)
    am.import_annotations(annotations)

    errors, warnings = am.validate()

    # report errors raised during the import itself as well as those
    # found by the post-import validation
    if len(am.errors) > 0 or len(errors) > 0:
        print(
            "importation completed with {} errors and {} warnings".format(
                len(am.errors) + len(errors), len(warnings)
            ),
            file=sys.stderr,
        )
        print("\n".join(am.errors), file=sys.stderr)
        print("\n".join(errors), file=sys.stderr)

    print("\n".join(warnings))
def test_lena(project):
    data = pd.read_csv("tests/data/lena_its.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "lena_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    lena = LenaMetrics(
        project,
        set="lena_its",
        period="1h",
        from_time="10:00:00",
        to_time="16:00:00",
    )
    lena.extract()

    truth = pd.read_csv("tests/truth/lena_metrics.csv")
    pd.testing.assert_frame_equal(lena.metrics, truth, check_like=True)
def test_intersect(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/intersect.csv")
    am.import_annotations(input_annotations)

    intersection = AnnotationManager.intersection(
        am.annotations[am.annotations["set"].isin(["textgrid", "vtc_rttm"])]
    ).convert_dtypes()

    a = intersection[intersection["set"] == "textgrid"]
    b = intersection[intersection["set"] == "vtc_rttm"]

    columns = a.columns.tolist()
    columns.remove("imported_at")
    columns.remove("package_version")
    columns.remove("merged_from")

    pd.testing.assert_frame_equal(
        standardize_dataframe(a, columns),
        standardize_dataframe(
            pd.read_csv("tests/truth/intersect_a.csv"), columns
        ).convert_dtypes(),
    )
    pd.testing.assert_frame_equal(
        standardize_dataframe(b, columns),
        standardize_dataframe(
            pd.read_csv("tests/truth/intersect_b.csv"), columns
        ).convert_dtypes(),
    )
def test_specs(project):
    data = pd.read_csv("tests/data/lena_its.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "specs_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    msp = MetricsSpecificationPipeline()

    parameters = "tests/data/parameters_metrics.yml"
    msp.run(parameters)

    output = pd.read_csv(msp.destination)
    truth = pd.read_csv("tests/truth/specs_metrics.csv")
    pd.testing.assert_frame_equal(output, truth, check_like=True)

    # rerun the pipeline from the parameters file it produced;
    # the output should be identical
    new_params = msp.parameters_path
    msp.run(new_params)

    output = pd.read_csv(msp.destination)
    pd.testing.assert_frame_equal(output, truth, check_like=True)
def test_custom(project):
    am = AnnotationManager(project)

    data = pd.read_csv("tests/data/lena_its.csv")
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "custom_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    parameters = "tests/data/list_metrics.csv"
    cmm = CustomMetrics(project, parameters)
    cmm.extract()

    truth = pd.read_csv("tests/truth/custom_metrics.csv")
    pd.testing.assert_frame_equal(cmm.metrics, truth, check_like=True)
def test_random_vocalization(project):
    segments = pd.DataFrame([
        {
            "segment_onset": 1000,
            "segment_offset": 2000,
            "speaker_type": speaker,
        }
        for speaker in ["CHI", "FEM", "MAL"]
    ])

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "random",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
        ]),
        import_function=partial(fake_conversation, segments),
    )

    sampler = RandomVocalizationSampler(
        project=project,
        annotation_set="random",
        target_speaker_type=["CHI"],
        sample_size=1,
        threads=1,
    )
    sampler.sample()

    chi_segments = segments[segments["speaker_type"] == "CHI"]
    pd.testing.assert_frame_equal(
        sampler.segments[["segment_onset", "segment_offset"]].astype(int),
        chi_segments[["segment_onset", "segment_offset"]].astype(int),
    )
def test_import(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/raw_annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    assert am.annotations.shape[0] == input_annotations.shape[0], \
        "imported annotations length does not match input"

    assert all([
        os.path.exists(os.path.join(project.path, "annotations", f))
        for f in am.annotations["annotation_filename"].tolist()
    ]), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    for dataset in ["eaf", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)

        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1)
            .sort_values(segments.columns.tolist())
            .reset_index(drop=True),
            pd.read_csv("tests/truth/{}.csv".format(dataset))
            .sort_index(axis=1)
            .sort_values(segments.columns.tolist())
            .reset_index(drop=True),
            check_less_precise=True,
        )
def test_conversation_sampler(project):
    conversations = [
        {"onset": 0, "vocs": 5},
        {"onset": 60 * 1000, "vocs": 10},
        {"onset": 1800 * 1000, "vocs": 15},
    ]
    segments = []
    for conversation in conversations:
        segments += [
            {
                "segment_onset": conversation["onset"] + i * (2000 + 500),
                "segment_offset": conversation["onset"] + i * (2000 + 500) + 2000,
                "speaker_type": ["FEM", "CHI"][i % 2],
            }
            for i in range(conversation["vocs"])
        ]
    segments = pd.DataFrame(segments)

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "conv",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 3600 * 1000 * 1000,
                "format": "rttm",
            }
        ]),
        import_function=partial(fake_conversation, segments),
    )

    sampler = ConversationSampler(
        project,
        "conv",
        count=5,
        interval=1000,
        speakers=["FEM", "CHI"],
    )
    sampler.sample()

    assert len(sampler.segments) == len(conversations)

    # conversations should be ranked by vocalization count, most active first
    assert sampler.segments["segment_onset"].tolist() == [
        conv["onset"]
        for conv in sorted(conversations, key=lambda c: c["vocs"], reverse=True)
    ]
def test_rename(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations[input_annotations["set"] == "textgrid"])
    am.read()

    tg_count = am.annotations[am.annotations["set"] == "textgrid"].shape[0]

    am.rename_set("textgrid", "renamed")
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    assert am.annotations[am.annotations["set"] == "textgrid"].shape[0] == 0
    assert am.annotations[am.annotations["set"] == "renamed"].shape[0] == tg_count
def test_import(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    assert (
        am.annotations.shape[0] == input_annotations.shape[0]
    ), "imported annotations length does not match input"

    assert all(
        [
            os.path.exists(
                os.path.join(
                    project.path,
                    "annotations",
                    a["set"],
                    "converted",
                    a["annotation_filename"],
                )
            )
            for a in am.annotations.to_dict(orient="records")
        ]
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    errors, warnings = am.read()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected"

    for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=set(annotations.columns) - {"raw_filename"}, inplace=True)
        truth = pd.read_csv("tests/truth/{}.csv".format(dataset))

        pd.testing.assert_frame_equal(
            standardize_dataframe(segments, set(truth.columns.tolist())),
            standardize_dataframe(truth, set(truth.columns.tolist())),
            check_less_precise=True,
        )
def test_vc_stats(project, turntakingthresh):
    am = AnnotationManager(project)
    am.import_annotations(pd.read_csv("examples/valid_raw_data/raw_annotations/input.csv"))

    raw_rttm = "example_metrics.rttm"
    segments = am.annotations[am.annotations["raw_filename"] == raw_rttm]

    vc = am.get_vc_stats(
        am.get_segments(segments), turntakingthresh=turntakingthresh
    ).reset_index()
    truth_vc = pd.read_csv("tests/truth/vc_truth_{:.1f}.csv".format(turntakingthresh))

    pd.testing.assert_frame_equal(
        vc.reset_index().sort_index(axis=1).sort_values(vc.columns.tolist()),
        truth_vc.reset_index().sort_index(axis=1).sort_values(vc.columns.tolist()),
        atol=3,
    )
def test_clipping(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/raw_annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    start = 1981
    stop = 1984
    segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    segments = am.clip_segments(segments, start, stop)

    assert (
        segments["segment_onset"].between(start, stop).all()
        and segments["segment_offset"].between(start, stop).all()
    ), "segments not properly clipped"
    assert segments.shape[0] == 2, "got {} segments, expected 2".format(segments.shape[0])
def test_clipping(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["recording_filename"] == "sound.wav"
    ]
    am.import_annotations(input_annotations[input_annotations["set"] == "vtc_rttm"])
    am.read()

    # clipping window, in milliseconds
    start = 1981000
    stop = 1984000
    segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    segments = am.clip_segments(segments, start, stop)

    assert (
        segments["segment_onset"].between(start, stop).all()
        and segments["segment_offset"].between(start, stop).all()
    ), "segments not properly clipped"
    assert segments.shape[0] == 2, "got {} segments, expected 2".format(segments.shape[0])
def test_custom_importation(project):
    am = AnnotationManager(project)
    input = pd.DataFrame([
        {
            "set": "vtc_rttm",
            "range_onset": 0,
            "range_offset": 4000,
            "recording_filename": "sound.wav",
            "time_seek": 0,
            "raw_filename": "example.rttm",
            "format": "custom",
        }
    ])

    am.import_annotations(input, import_function=custom_function)
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0
def test_metrics_segments(project):
    data = pd.read_csv("tests/data/aclew.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": set,
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
            for set in ["segments_vtc", "segments_alice", "segments_vcm"]
        ]),
        import_function=partial(fake_vocs, data),
    )

    lm = pd.DataFrame(
        np.array([
            ["voc_speaker", "segments_vtc", "FEM"],
            ["voc_speaker", "segments_vtc", "CHI"],
            ["voc_speaker_ph", "segments_vtc", "FEM"],
            ["voc_speaker_ph", "segments_vtc", "CHI"],
            ["wc_speaker_ph", "segments_alice", "FEM"],
            ["lp_n", "segments_vcm", pd.NA],
            ["lp_dur", "segments_vcm", pd.NA],
        ]),
        columns=["callable", "set", "speaker"],
    )
    metrics = Metrics(
        project,
        metrics_list=lm,
        by="segments",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        segments="tests/data/segments.csv",
    )
    metrics.extract()

    truth = pd.read_csv("tests/truth/segments_metrics.csv")
    pd.testing.assert_frame_equal(metrics.metrics, truth, check_like=True)
def test_intersect(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/raw_annotations/intersect.csv")
    am.import_annotations(input_annotations)
    am.read()

    a, b = am.intersection(
        am.annotations[am.annotations["set"] == "textgrid"],
        am.annotations[am.annotations["set"] == "vtc_rttm"],
    )

    pd.testing.assert_frame_equal(
        a.sort_index(axis=1)
        .sort_values(a.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=["imported_at"]),
        pd.read_csv("tests/truth/intersect_a.csv")
        .sort_index(axis=1)
        .sort_values(a.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=["imported_at"]),
    )
    pd.testing.assert_frame_equal(
        b.sort_index(axis=1)
        .sort_values(b.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=["imported_at"]),
        pd.read_csv("tests/truth/intersect_b.csv")
        .sort_index(axis=1)
        .sort_values(b.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=["imported_at"]),
    )
def test_periodic(project):
    os.makedirs("output/eaf", exist_ok=True)

    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "vtc",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "vtc_rttm",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=["sound.wav"])
    sampler.sample()
    sampler.segments.to_csv("output/eaf/segments.csv")

    ranges = sampler.segments.rename(
        columns={"segment_onset": "range_onset", "segment_offset": "range_offset"}
    )
    annotations = am.get_within_ranges(ranges, [IMP_FROM], "warn")
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination="output/eaf",
        segments="output/eaf/segments.csv",
        eaf_type="periodic",
        template="basic",
        context_onset=250,
        context_offset=250,
        path="output/eaf",
        import_speech_from="vtc",
    )

    eaf = Eaf("output/eaf/sound/sound_periodic_basic.eaf")

    def tier_segments(tier):
        """extract the (onset, offset) pairs of an EAF tier, sorted"""
        return pd.DataFrame([
            {
                "segment_onset": int(eaf.timeslots[start_ts]),
                "segment_offset": int(eaf.timeslots[end_ts]),
            }
            for (start_ts, end_ts, value, svg_ref) in eaf.tiers[tier][0].values()
        ]).sort_values(["segment_onset", "segment_offset"]).reset_index(drop=True)

    def sort_segments(segments):
        return (
            segments[["segment_onset", "segment_offset"]]
            .sort_values(["segment_onset", "segment_offset"])
            .reset_index(drop=True)
        )

    # the periodic code tier should match the sampled segments
    pd.testing.assert_frame_equal(
        tier_segments("code_periodic"), sort_segments(sampler.segments)
    )

    # each VTC tier of the output EAF should match the corresponding imported segments
    expectations = {
        "VTC-SPEECH": annot_segments[annot_segments["speaker_type"].isnull()],
        "VTC-CHI": annot_segments[annot_segments["speaker_type"] == "CHI"],
        "VTC-OCH": annot_segments[annot_segments["speaker_type"] == "OCH"],
        "VTC-FEM": annot_segments[annot_segments["speaker_type"] == "FEM"],
    }
    for tier, expected in expectations.items():
        pd.testing.assert_frame_equal(tier_segments(tier), sort_segments(expected))

    assert eaf.media_descriptors[0]["MEDIA_URL"] == "sound.wav"
def test_merge(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["set"].isin(["vtc_rttm", "alice"])
    ]
    am.import_annotations(input_annotations)
    am.read()

    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        recording_filter={"sound.wav"},
    )
    am.read()

    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 1
    assert anns.iloc[0]["recording_filename"] == "sound.wav"

    # sleep 2 seconds so the two merges get distinct 'imported_at' values,
    # which lets us check below that both merges completed
    time.sleep(2)

    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        skip_existing=True,
    )
    am.read()

    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 2
    assert set(anns["recording_filename"].unique()) == {"sound.wav", "sound2.wav"}
    assert anns.iloc[0]["imported_at"] != anns.iloc[1]["imported_at"]

    segments = am.get_segments(am.annotations[am.annotations["set"] == "alice_vtc"])
    vtc_segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    assert segments.shape[0] == vtc_segments.shape[0]
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    adult_segments = (
        segments[segments["speaker_type"].isin(["FEM", "MAL"])]
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )
    alice = (
        am.get_segments(am.annotations[am.annotations["set"] == "alice"])
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        adult_segments[["phonemes", "syllables", "words"]],
        alice[["phonemes", "syllables", "words"]],
    )
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(
    description="import and convert VTC annotations into the project"
)
parser.add_argument("--source", help="project path", required=True)
parser.add_argument(
    "--overwrite",
    help="remove any existing 'vtc' annotation set before importing",
    dest="overwrite",
    action="store_true",
)
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set("vtc")

input = project.recordings[["filename"]]
input.rename(columns={"filename": "recording_filename"}, inplace=True)
input = input[input["recording_filename"] != "NA"]
input["set"] = "vtc"
input["time_seek"] = 0
input["range_onset"] = 0
input["range_offset"] = 0
input["raw_filename"] = input["recording_filename"].apply(
    lambda s: os.path.join("vtc", s + ".rttm")
)
input["format"] = "vtc_rttm"

am.import_annotations(input)
#!/usr/bin/env python3
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(
    description="import and convert VTC annotations into the project"
)
parser.add_argument("--source", help="project path", required=True)
parser.add_argument(
    "--set",
    help="annotation set. the rttm files should lie in <source>/annotations/<set>/raw/",
    default="vtc",
)
parser.add_argument(
    "--overwrite",
    help="remove the existing annotation set before importing",
    dest="overwrite",
    action="store_true",
)
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set(args.set)

input = project.recordings[["recording_filename", "duration"]]
input = input[input["recording_filename"] != "NA"]
input["set"] = args.set
input["time_seek"] = 0
input["range_onset"] = 0
input["range_offset"] = input["duration"]
input["raw_filename"] = input["recording_filename"].apply(
    lambda s: os.path.splitext(s)[0] + ".rttm"
)
input["format"] = "vtc_rttm"

am.import_annotations(input, threads=4)