def test_aclew(project):
    data = pd.read_csv("tests/data/aclew.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": s,
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
            for s in ["aclew_vtc", "aclew_alice", "aclew_vcm"]
        ]),
        import_function=partial(fake_vocs, data),
    )

    aclew = AclewMetrics(
        project,
        by="child_id",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        vtc="aclew_vtc",
        alice="aclew_alice",
        vcm="aclew_vcm",
    )
    aclew.extract()

    truth = pd.read_csv("tests/truth/aclew_metrics.csv")
    pd.testing.assert_frame_equal(aclew.metrics, truth, check_like=True)
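# The tests in this file take a `project` pytest fixture that is not shown
# here. A minimal sketch of what it could look like, assuming the repository's
# examples/valid_raw_data sample dataset and an output/annotations scratch
# directory (the scratch path is an assumption, not part of the original file):
import os
import shutil

import pytest

from ChildProject.projects import ChildProject


@pytest.fixture(scope="function")
def project(request):
    # work on a disposable copy so tests can mutate the dataset freely
    if os.path.exists("output/annotations"):
        shutil.rmtree("output/annotations")
    shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")

    project = ChildProject("output/annotations")
    project.read()
    yield project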
def test_lena(project):
    data = pd.read_csv("tests/data/lena_its.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "lena_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    lena = LenaMetrics(
        project,
        set="lena_its",
        period="1h",
        from_time="10:00:00",
        to_time="16:00:00",
    )
    lena.extract()

    truth = pd.read_csv("tests/truth/lena_metrics.csv")
    pd.testing.assert_frame_equal(lena.metrics, truth, check_like=True)
def test_intersect(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/intersect.csv")
    am.import_annotations(input_annotations)

    intersection = AnnotationManager.intersection(
        am.annotations[am.annotations["set"].isin(["textgrid", "vtc_rttm"])]
    ).convert_dtypes()
    a = intersection[intersection["set"] == "textgrid"]
    b = intersection[intersection["set"] == "vtc_rttm"]

    # generated columns cannot be compared against the static truth files
    columns = a.columns.tolist()
    columns.remove("imported_at")
    columns.remove("package_version")
    columns.remove("merged_from")

    pd.testing.assert_frame_equal(
        standardize_dataframe(a, columns),
        standardize_dataframe(
            pd.read_csv("tests/truth/intersect_a.csv"), columns
        ).convert_dtypes(),
    )
    pd.testing.assert_frame_equal(
        standardize_dataframe(b, columns),
        standardize_dataframe(
            pd.read_csv("tests/truth/intersect_b.csv"), columns
        ).convert_dtypes(),
    )
def test_specs(project):
    data = pd.read_csv("tests/data/lena_its.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "specs_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    msp = MetricsSpecificationPipeline()

    parameters = "tests/data/parameters_metrics.yml"
    msp.run(parameters)

    output = pd.read_csv(msp.destination)
    truth = pd.read_csv("tests/truth/specs_metrics.csv")
    pd.testing.assert_frame_equal(output, truth, check_like=True)

    # rerun the pipeline from the parameters it exported; results must be identical
    new_params = msp.parameters_path
    msp.run(new_params)

    output = pd.read_csv(msp.destination)
    pd.testing.assert_frame_equal(output, truth, check_like=True)
def test_random_vocalization(project):
    segments = pd.DataFrame([
        {
            "segment_onset": 1000,
            "segment_offset": 2000,
            "speaker_type": speaker,
        }
        for speaker in ["CHI", "FEM", "MAL"]
    ])

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "random",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
        ]),
        import_function=partial(fake_conversation, segments),
    )

    sampler = RandomVocalizationSampler(
        project=project,
        annotation_set="random",
        target_speaker_type=["CHI"],
        sample_size=1,
        threads=1,
    )
    sampler.sample()

    chi_segments = segments[segments["speaker_type"] == "CHI"]
    pd.testing.assert_frame_equal(
        sampler.segments[["segment_onset", "segment_offset"]].astype(int),
        chi_segments[["segment_onset", "segment_offset"]].astype(int),
    )
def test_custom(project):
    am = AnnotationManager(project)
    data = pd.read_csv("tests/data/lena_its.csv")
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "custom_its",
                "raw_filename": "file.its",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 100000000,
                "format": "its",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    parameters = "tests/data/list_metrics.csv"
    cmm = CustomMetrics(project, parameters)
    cmm.extract()

    truth = pd.read_csv("tests/truth/custom_metrics.csv")
    pd.testing.assert_frame_equal(cmm.metrics, truth, check_like=True)
def import_annotations(args):
    """convert and import a set of annotations"""
    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print(
            "validation failed, {} error(s) occurred".format(len(errors)),
            file=sys.stderr,
        )
        sys.exit(1)

    if args.annotations:
        annotations = pd.read_csv(args.annotations)
    else:
        annotations = pd.DataFrame([
            {
                col.name: getattr(args, col.name)
                for col in AnnotationManager.INDEX_COLUMNS
                if not col.generated
            }
        ])

    am = AnnotationManager(project)
    am.import_annotations(annotations)

    errors, warnings = am.validate()
    # report importation errors as well as validation errors
    if len(am.errors) + len(errors) > 0:
        print(
            "importation completed with {} errors and {} warnings".format(
                len(am.errors) + len(errors), len(warnings)
            ),
            file=sys.stderr,
        )
        print("\n".join(am.errors), file=sys.stderr)
        print("\n".join(errors), file=sys.stderr)
        print("\n".join(warnings))
def test_segments_timestamps(project):
    am = AnnotationManager(project)

    segments = pd.DataFrame([
        {
            "recording_filename": "sound.wav",
            "segment_onset": 3600 * 1000,
            "segment_offset": 3600 * 1000 + 1000,
        }
    ])
    segments = am.get_segments_timestamps(segments)

    truth = pd.DataFrame([
        {
            "recording_filename": "sound.wav",
            "segment_onset": 3600 * 1000,
            "segment_offset": 3600 * 1000 + 1000,
            "onset_time": datetime.datetime(2020, 4, 20, 9 + 1, 0, 0),
            "offset_time": datetime.datetime(2020, 4, 20, 9 + 1, 0, 1),
        }
    ])

    pd.testing.assert_frame_equal(
        standardize_dataframe(segments, truth.columns),
        standardize_dataframe(truth, truth.columns),
    )
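# A minimal sketch (not library code) of the arithmetic get_segments_timestamps
# is expected to perform here, assuming the metadata pins the start of
# sound.wav at 2020-04-20 09:00:00: absolute time = recording start + onset
# expressed in milliseconds.
import datetime


def _expected_timestamp(start: datetime.datetime, onset_ms: int) -> datetime.datetime:
    return start + datetime.timedelta(milliseconds=onset_ms)

# _expected_timestamp(datetime.datetime(2020, 4, 20, 9), 3600 * 1000)
# -> datetime.datetime(2020, 4, 20, 10, 0), matching `onset_time` above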
def test_conversation_sampler(project):
    conversations = [
        {"onset": 0, "vocs": 5},
        {"onset": 60 * 1000, "vocs": 10},
        {"onset": 1800 * 1000, "vocs": 15},
    ]

    segments = []
    for conversation in conversations:
        segments += [
            {
                "segment_onset": conversation["onset"] + i * (2000 + 500),
                "segment_offset": conversation["onset"] + i * (2000 + 500) + 2000,
                "speaker_type": ["FEM", "CHI"][i % 2],
            }
            for i in range(conversation["vocs"])
        ]
    segments = pd.DataFrame(segments)

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "conv",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 3600 * 1000 * 1000,
                "format": "rttm",
            }
        ]),
        import_function=partial(fake_conversation, segments),
    )

    sampler = ConversationSampler(
        project,
        "conv",
        count=5,
        interval=1000,
        speakers=["FEM", "CHI"],
    )
    sampler.sample()

    assert len(sampler.segments) == len(conversations)
    # conversations should be ranked by vocalization count, most talkative first
    assert sampler.segments["segment_onset"].tolist() == [
        conv["onset"]
        for conv in sorted(conversations, key=lambda c: c["vocs"], reverse=True)
    ]
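# For reference, a simplified, self-contained sketch of the grouping rule the
# test above relies on (assumed behavior, not the library's implementation):
# vocalizations less than `interval` ms apart belong to the same conversation,
# and conversations are ranked by how many vocalizations they contain.
def _group_conversations(vocs, interval):
    conversations = []
    for onset, offset in sorted(vocs):
        if conversations and onset - conversations[-1][-1][1] < interval:
            conversations[-1].append((onset, offset))  # continue the conversation
        else:
            conversations.append([(onset, offset)])  # start a new one
    return sorted(conversations, key=len, reverse=True)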
def test_import(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    assert am.annotations.shape[0] == input_annotations.shape[0], \
        "imported annotations length does not match input"

    assert all([
        os.path.exists(os.path.join(project.path, 'annotations', f))
        for f in am.annotations['annotation_filename'].tolist()
    ]), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    for dataset in ['eaf', 'textgrid', 'eaf_solis']:
        annotations = am.annotations[am.annotations['set'] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)

        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1)
            .sort_values(segments.columns.tolist())
            .reset_index(drop=True),
            pd.read_csv('tests/truth/{}.csv'.format(dataset))
            .sort_index(axis=1)
            .sort_values(segments.columns.tolist())
            .reset_index(drop=True),
            check_less_precise=True,
        )
def test_within_ranges(project):
    am = AnnotationManager(project)

    annotations = [
        {
            "recording_filename": "sound.wav",
            "set": "matching",
            "range_onset": onset,
            "range_offset": onset + 500,
        }
        for onset in np.arange(0, 4000, 500)
    ]
    matching_annotations = pd.DataFrame([
        annotation
        for annotation in annotations
        if annotation["range_onset"] >= 1000 and annotation["range_offset"] <= 3000
    ])
    am.annotations = pd.DataFrame(annotations)

    ranges = pd.DataFrame([
        {"recording_filename": "sound.wav", "range_onset": 1000, "range_offset": 3000}
    ])
    matches = am.get_within_ranges(ranges, ["matching"])

    pd.testing.assert_frame_equal(
        standardize_dataframe(matching_annotations, matching_annotations.columns),
        standardize_dataframe(matches, matching_annotations.columns),
    )

    ranges["range_offset"] = 5000
    exception_caught = False
    try:
        matches = am.get_within_ranges(ranges, ["matching"], "raise")
    except Exception as e:
        if str(e) == (
            "annotations from set 'matching' do not cover the whole selected range "
            "for recording 'sound.wav', 3.000s covered instead of 4.000s"
        ):
            exception_caught = True

    assert exception_caught, (
        "get_within_ranges should raise an exception when annotations "
        "do not fully cover the required ranges"
    )
def test_import(project): am = AnnotationManager(project) input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv") am.import_annotations(input_annotations) am.read() assert ( am.annotations.shape[0] == input_annotations.shape[0] ), "imported annotations length does not match input" assert all( [ os.path.exists( os.path.join( project.path, "annotations", a["set"], "converted", a["annotation_filename"], ) ) for a in am.annotations.to_dict(orient="records") ] ), "some annotations are missing" errors, warnings = am.validate() assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected" errors, warnings = am.read() assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected" for dataset in ["eaf_basic", "textgrid", "eaf_solis"]: annotations = am.annotations[am.annotations["set"] == dataset] segments = am.get_segments(annotations) segments.drop(columns=set(annotations.columns) - {"raw_filename"}, inplace=True) truth = pd.read_csv("tests/truth/{}.csv".format(dataset)) print(segments) print(truth) pd.testing.assert_frame_equal( standardize_dataframe(segments, set(truth.columns.tolist())), standardize_dataframe(truth, set(truth.columns.tolist())), check_less_precise=True, )
def test_within_time_range(project):
    from ChildProject.utils import TimeInterval

    am = AnnotationManager(project)
    am.project.recordings = pd.read_csv("tests/data/time_range_recordings.csv")

    annotations = pd.read_csv("tests/data/time_range_annotations.csv")
    matches = am.get_within_time_range(
        annotations,
        TimeInterval(
            datetime.datetime(1900, 1, 1, 9, 0),
            datetime.datetime(1900, 1, 1, 20, 0),
        ),
    )

    truth = pd.read_csv("tests/truth/time_range.csv")
    pd.testing.assert_frame_equal(
        standardize_dataframe(matches, truth.columns),
        standardize_dataframe(truth, truth.columns),
    )

    exception_caught = False
    try:
        matches = am.get_within_time_range(annotations, "9am", "8pm")
    except ValueError:
        exception_caught = True

    assert exception_caught, "no exception was thrown despite invalid times"
def test_metrics_segments(project):
    data = pd.read_csv("tests/data/aclew.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": s,
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "rttm",
            }
            for s in ["segments_vtc", "segments_alice", "segments_vcm"]
        ]),
        import_function=partial(fake_vocs, data),
    )

    lm = pd.DataFrame(
        np.array([
            ["voc_speaker", "segments_vtc", "FEM"],
            ["voc_speaker", "segments_vtc", "CHI"],
            ["voc_speaker_ph", "segments_vtc", "FEM"],
            ["voc_speaker_ph", "segments_vtc", "CHI"],
            ["wc_speaker_ph", "segments_alice", "FEM"],
            ["lp_n", "segments_vcm", pd.NA],
            ["lp_dur", "segments_vcm", pd.NA],
        ]),
        columns=["callable", "set", "speaker"],
    )

    metrics = Metrics(
        project,
        metrics_list=lm,
        by="segments",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        segments="tests/data/segments.csv",
    )
    metrics.extract()

    truth = pd.read_csv("tests/truth/segments_metrics.csv")
    pd.testing.assert_frame_equal(metrics.metrics, truth, check_like=True)
def test_clipping(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    start = 1981
    stop = 1984
    segments = am.get_segments(am.annotations[am.annotations['set'] == 'vtc_rttm'])
    segments = am.clip_segments(segments, start, stop)

    assert (
        segments['segment_onset'].between(start, stop).all()
        and segments['segment_offset'].between(start, stop).all()
    ), "segments not properly clipped"
    assert segments.shape[0] == 2, "got {} segments, expected 2".format(
        segments.shape[0]
    )
def test_clipping(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["recording_filename"] == "sound.wav"
    ]
    am.import_annotations(input_annotations[input_annotations["set"] == "vtc_rttm"])
    am.read()

    start = 1981000
    stop = 1984000
    segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    segments = am.clip_segments(segments, start, stop)

    assert (
        segments["segment_onset"].between(start, stop).all()
        and segments["segment_offset"].between(start, stop).all()
    ), "segments not properly clipped"
    assert segments.shape[0] == 2, "got {} segments, expected 2".format(
        segments.shape[0]
    )
def test_set_from_path(project):
    am = AnnotationManager(project)

    assert am.set_from_path(os.path.join(project.path, "annotations/set")) == "set"
    assert am.set_from_path(os.path.join(project.path, "annotations/set/")) == "set"
    assert (
        am.set_from_path(os.path.join(project.path, "annotations/set/subset"))
        == "set/subset"
    )
    assert (
        am.set_from_path(os.path.join(project.path, "annotations/set/subset/converted"))
        == "set/subset"
    )
    assert (
        am.set_from_path(os.path.join(project.path, "annotations/set/subset/raw"))
        == "set/subset"
    )
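# A sketch of the mapping set_from_path is expected to implement (assumed, not
# the library's actual code): take the path relative to <project>/annotations,
# drop a trailing raw/ or converted/ component, and return the remainder.
def _set_from_path_sketch(annotations_root: str, path: str) -> str:
    parts = [p for p in os.path.relpath(path, annotations_root).split(os.sep) if p]
    if parts and parts[-1] in ("raw", "converted"):
        parts = parts[:-1]
    return "/".join(parts)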
def test_custom_importation(project):
    am = AnnotationManager(project)
    input = pd.DataFrame([
        {
            "set": "vtc_rttm",
            "range_onset": 0,
            "range_offset": 4000,
            "recording_filename": "sound.wav",
            "time_seek": 0,
            "raw_filename": "example.rttm",
            "format": "custom",
        }
    ])

    am.import_annotations(input, import_function=custom_function)
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0
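# `custom_function` is defined elsewhere in the test module. A plausible shape
# for such a callable, assuming import_function receives the path to the raw
# annotation file and must return a segments DataFrame with millisecond
# onsets/offsets (the column list below is illustrative, not prescriptive):
def _example_import_function(filename: str) -> pd.DataFrame:
    df = pd.read_csv(
        filename,
        sep=" ",
        header=None,
        names=["segment_onset", "segment_offset", "speaker_type"],
    )
    # convert seconds to milliseconds, assuming the raw file uses seconds
    df[["segment_onset", "segment_offset"]] = (
        df[["segment_onset", "segment_offset"]] * 1000
    ).astype(int)
    return df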
def test_vc_stats(project, turntakingthresh):
    am = AnnotationManager(project)
    am.import_annotations(pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv'))

    raw_rttm = 'example_metrics.rttm'
    segments = am.annotations[am.annotations['raw_filename'] == raw_rttm]

    vc = am.get_vc_stats(
        am.get_segments(segments), turntakingthresh=turntakingthresh
    ).reset_index()
    truth_vc = pd.read_csv('tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh))

    pd.testing.assert_frame_equal(
        vc.reset_index().sort_index(axis=1).sort_values(vc.columns.tolist()),
        truth_vc.reset_index().sort_index(axis=1).sort_values(vc.columns.tolist()),
        atol=3,
    )
def test_intersect(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv('examples/valid_raw_data/raw_annotations/intersect.csv')
    am.import_annotations(input_annotations)
    am.read()

    a, b = am.intersection(
        am.annotations[am.annotations['set'] == 'textgrid'],
        am.annotations[am.annotations['set'] == 'vtc_rttm'],
    )

    pd.testing.assert_frame_equal(
        a.sort_index(axis=1)
        .sort_values(a.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=['imported_at']),
        pd.read_csv('tests/truth/intersect_a.csv')
        .sort_index(axis=1)
        .sort_values(a.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=['imported_at']),
    )
    pd.testing.assert_frame_equal(
        b.sort_index(axis=1)
        .sort_values(b.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=['imported_at']),
        pd.read_csv('tests/truth/intersect_b.csv')
        .sort_index(axis=1)
        .sort_values(b.columns.tolist())
        .reset_index(drop=True)
        .drop(columns=['imported_at']),
    )
def run(
    self,
    destination: str,
    segments: str,
    eaf_type: str,
    template: str,
    context_onset: int = 0,
    context_offset: int = 0,
    path: str = None,
    import_speech_from: str = None,
    **kwargs,
):
    """generate .eaf templates based on intervals to code.

    :param destination: eaf destination
    :type destination: str
    :param segments: path to the input segments dataframe
    :type segments: str
    :param eaf_type: eaf type [random, periodic]
    :type eaf_type: str
    :param template: name of the template to use (basic, native, or non-native)
    :type template: str
    :param context_onset: context to add before the segment onset, in milliseconds; 0 for no introductory context
    :type context_onset: int
    :param context_offset: context to add after the segment offset, in milliseconds; 0 for no outro context
    :type context_offset: int
    :param path: project path
    :type path: str
    :param import_speech_from: annotation set to pre-fill the eafs with
    :type import_speech_from: str
    """
    try:
        from importlib import resources
    except ImportError:
        # TODO: Perhaps add this as a dependency to the resources?
        import importlib_resources as resources

    etf_path = "{}.etf".format(template)
    pfsx_path = "{}.pfsx".format(template)

    if template in ["basic", "native", "non-native"]:
        with resources.path("ChildProject.templates", etf_path) as etf:
            etf_path = str(etf)
        with resources.path("ChildProject.templates", pfsx_path) as pfsx:
            pfsx_path = str(pfsx)

    if not os.path.exists(etf_path):
        raise Exception("{} cannot be found".format(etf_path))
    if not os.path.exists(pfsx_path):
        raise Exception("{} cannot be found".format(pfsx_path))

    print("making the " + eaf_type + " eaf file and csv")

    segments = pd.read_csv(segments)

    assert_dataframe("segments", segments, not_empty=True)
    assert_columns_presence(
        "segments",
        segments,
        {"recording_filename", "segment_onset", "segment_offset"},
    )

    imported_set = None
    prefill = path and import_speech_from
    if prefill:
        project = ChildProject(path)
        am = AnnotationManager(project)
        am.read()
        imported_set = import_speech_from

    for recording_filename, segs in segments.groupby("recording_filename"):
        recording_prefix = os.path.splitext(recording_filename)[0]
        output_filename = (
            recording_prefix + "_" + eaf_type + "_" + os.path.basename(template)
        )

        # TODO: This list of timestamps as tuples might not be ideal/should
        # perhaps be optimized, but I am just replicating the original eaf
        # creation code here.
        timestamps = [
            (on, off)
            for on, off in segs.loc[:, ["segment_onset", "segment_offset"]].values
        ]

        speech_segments = None
        imported_format = None
        if prefill:
            ranges = segs.assign(recording_filename=recording_filename).rename(
                columns={
                    "segment_onset": "range_onset",
                    "segment_offset": "range_offset",
                }
            )
            matches = am.get_within_ranges(ranges, [import_speech_from], "warn")

            if len(matches) == 0:
                continue

            speech_segments = am.get_segments(matches)
            try:
                matches = matches["format"].drop_duplicates()
                if len(matches.index) == 1:
                    imported_format = matches.iloc[0]
            except KeyError:
                imported_format = None

        output_dir = os.path.join(destination, recording_prefix)

        create_eaf(
            etf_path,
            output_filename,
            output_dir,
            recording_filename,
            timestamps,
            eaf_type,
            context_onset,
            context_offset,
            template,
            speech_segments,
            imported_set,
            imported_format,
        )

        shutil.copy(
            pfsx_path, os.path.join(output_dir, "{}.pfsx".format(output_filename))
        )
def test_rename(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations[input_annotations["set"] == "textgrid"])
    am.read()

    tg_count = am.annotations[am.annotations["set"] == "textgrid"].shape[0]

    am.rename_set("textgrid", "renamed")
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    assert am.annotations[am.annotations["set"] == "textgrid"].shape[0] == 0
    assert am.annotations[am.annotations["set"] == "renamed"].shape[0] == tg_count
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task

import argparse

parser = argparse.ArgumentParser(
    description='compute agreement measures for all given annotators over a whole dataset')
parser.add_argument('path', help='path to the dataset')
parser.add_argument('--sets', nargs='+', help='sets to include')
args = parser.parse_args()

speakers = ['CHI', 'OCH', 'FEM', 'MAL']

project = ChildProject(args.path)
am = AnnotationManager(project)
am.read()

intersection = AnnotationManager.intersection(am.annotations, args.sets)
segments = am.get_collapsed_segments(intersection)
segments = segments[segments['speaker_type'].isin(speakers)]

# the original snippet was cut off mid-call; the argument list below follows
# the ChildProject.metrics API (a 100 ms grid over the full covered range)
vectors = [
    grid_to_vector(
        segments_to_grid(
            segments[segments['set'] == s],
            0,
            segments['segment_offset'].max(),
            100,
            'speaker_type',
            speakers,
        ),
        speakers,
    )
    for s in args.sets
]
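# Hedged continuation (the original script is truncated at this point): the
# imports suggest the per-set vectors feed vectors_to_annotation_task, which
# is assumed to return a pygamma-agreement continuum whose gamma measure can
# then be computed and reported.
task = vectors_to_annotation_task(*vectors)
gamma_results = task.compute_gamma()
print(gamma_results.gamma)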
def extract_chunks(
    self,
    destination,
    path,
    annotation_set='vtc',
    batch_size=1000,
    target_speaker_type='CHI',
    sample_size=500,
    chunk_length=500,
    threads=0,
    batches=0,
    **kwargs,
):
    assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

    self.destination = destination
    self.project = ChildProject(path)

    batch_size = int(batch_size)
    sample_size = int(sample_size)
    chunk_length = int(chunk_length)
    threads = int(threads)

    self.sample_size = sample_size
    self.chunk_length = chunk_length

    am = AnnotationManager(self.project)
    self.annotations = am.annotations
    self.annotations = self.annotations[self.annotations['set'] == annotation_set]
    self.segments = am.get_segments(self.annotations)
    self.segments = self.segments[self.segments['speaker_type'] == target_speaker_type]
    self.segments['segment_onset'] = self.segments['segment_onset'] + self.segments['time_seek']
    self.segments['segment_offset'] = self.segments['segment_offset'] + self.segments['time_seek']

    destination_path = os.path.join(destination, 'chunks')
    os.makedirs(destination_path, exist_ok=True)
    if os.listdir(destination_path):
        raise ValueError(
            "destination '{}' is not empty, please choose another destination."
            .format(destination_path))

    segments = []
    for _recording, _segments in self.segments.groupby('recording_filename'):
        segments.append(_segments.assign(recording_filename=_recording))

    pool = mp.Pool(threads if threads > 0 else mp.cpu_count())
    self.chunks = pool.map(self.split_recording, segments)
    self.chunks = itertools.chain.from_iterable(self.chunks)
    self.chunks = pd.DataFrame([
        {
            'recording': c.recording,
            'onset': c.onset,
            'offset': c.offset,
            'wav': c.getbasename('wav'),
            'mp3': c.getbasename('mp3'),
            'speaker_type': target_speaker_type,
            'date_extracted': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'uploaded': False,
            'project_slug': '',
            'subject_set': '',
            'zooniverse_id': 0,
        }
        for c in self.chunks
    ])

    # shuffle chunks so that they can't be joined back together
    # based on Zooniverse subject IDs
    self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
    self.chunks['batch'] = self.chunks.index.map(lambda x: int(x / batch_size))
    self.chunks.index.name = 'index'
    self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
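# A sketch of the splitting rule implied by the chunk_length assertion above
# (assumed behavior): each sampled segment is cut into consecutive
# chunk_length-ms windows, which is why chunk_length must divide 1000 evenly.
def _chunk_bounds(onset: int, offset: int, chunk_length: int):
    return [
        (t, min(t + chunk_length, offset))
        for t in range(onset, offset, chunk_length)
    ]

# _chunk_bounds(0, 1000, 500) -> [(0, 500), (500, 1000)]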
def test_merge(project):
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["set"].isin(["vtc_rttm", "alice"])
    ]
    am.import_annotations(input_annotations)
    am.read()

    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        recording_filter={"sound.wav"},
    )
    am.read()

    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 1
    assert anns.iloc[0]["recording_filename"] == "sound.wav"

    # sleep for 2 seconds to guarantee distinct 'imported_at' values,
    # so we can verify that both merges went through
    time.sleep(2)

    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        skip_existing=True,
    )
    am.read()

    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 2
    assert set(anns["recording_filename"].unique()) == {"sound.wav", "sound2.wav"}
    assert anns.iloc[0]["imported_at"] != anns.iloc[1]["imported_at"]

    segments = am.get_segments(am.annotations[am.annotations["set"] == "alice_vtc"])
    vtc_segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    assert segments.shape[0] == vtc_segments.shape[0]
    # the merged set carries the three ALICE columns on top of the VTC ones
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    adult_segments = (
        segments[segments["speaker_type"].isin(["FEM", "MAL"])]
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )
    alice = (
        am.get_segments(am.annotations[am.annotations["set"] == "alice"])
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        adult_segments[["phonemes", "syllables", "words"]],
        alice[["phonemes", "syllables", "words"]],
    )
def test_periodic(project):
    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([
            {
                "set": "vtc",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "vtc_rttm",
            }
        ]),
        import_function=partial(fake_vocs, data),
    )

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
    sampler.sample()
    sampler.segments.to_csv('output/eaf/segments.csv')

    ranges = sampler.segments.rename(
        columns={
            "segment_onset": "range_onset",
            "segment_offset": "range_offset",
        }
    )
    annotations = am.get_within_ranges(ranges, [IMP_FROM], 'warn')
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination='output/eaf',
        segments='output/eaf/segments.csv',
        eaf_type='periodic',
        template='basic',
        context_onset=250,
        context_offset=250,
        path='output/eaf',
        import_speech_from='vtc',
    )

    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')

    def tier_segments(tier_name):
        # collect (onset, offset) pairs for every annotation of a tier
        tier = eaf.tiers[tier_name][0]
        segs = []
        for pid in tier:
            (start_ts, end_ts, value, svg_ref) = tier[pid]
            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
            segs.append({
                'segment_onset': int(start_t),
                'segment_offset': int(end_t),
            })
        return pd.DataFrame(segs)

    def sorted_segments(df):
        return (
            df[['segment_onset', 'segment_offset']]
            .sort_values(['segment_onset', 'segment_offset'])
            .reset_index(drop=True)
        )

    # the periodic code tier must match the sampled segments
    pd.testing.assert_frame_equal(
        sorted_segments(tier_segments('code_periodic')),
        sorted_segments(sampler.segments),
    )

    # each VTC tier must match the corresponding imported speech segments
    speech_segs = annot_segments[pd.isnull(annot_segments['speaker_type'])]
    pd.testing.assert_frame_equal(
        sorted_segments(tier_segments('VTC-SPEECH')),
        sorted_segments(speech_segs),
    )

    for speaker in ['CHI', 'OCH', 'FEM']:
        speaker_segs = annot_segments[annot_segments['speaker_type'] == speaker]
        pd.testing.assert_frame_equal(
            sorted_segments(tier_segments('VTC-{}'.format(speaker))),
            sorted_segments(speaker_segs),
        )

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
class MetadataExtractor(BaseMetadataExtractor):
    def _load(self):
        self.project = ChildProject(self.ds.path)
        self.am = AnnotationManager(self.project)
        self.am.read()

    def _get_dsmeta(self, dataset, content):
        recordings = self.project.recordings
        children = self.project.children

        ## Extract experiment(s)
        experiment = None
        try:
            experiments = list(recordings['experiment'].unique())
            assert len(experiments) == 1
            experiment = experiments[0]
        except Exception as exc:
            lgr.error("could not determine the experiment ({})".format(str(exc)))

        dsmeta = {'experiment': experiment}

        ## Extract sample size
        dsmeta['total_children'] = children.shape[0]
        dsmeta['total_recordings'] = recordings.dropna(subset=['recording_filename']).shape[0]
        dsmeta['total_duration'] = int(recordings['duration'].sum())

        ## Extract languages
        languages = []
        if 'language' in children.columns:
            languages.extend(children['language'].str.strip().tolist())
        if 'languages' in children.columns:
            # 'languages' holds semicolon-separated lists; flatten and strip each entry
            languages.extend([
                language.strip()
                for language in np.ravel(children['languages'].str.split(';').tolist())
            ])
        dsmeta['languages'] = list(set(languages))

        ### Extract devices
        dsmeta['devices'] = list(recordings['recording_device_type'].dropna().unique())

        ### Vocabulary specifications
        context = {}
        context['childproject'] = {
            '@id': '#',
            'description': 'ad-hoc vocabulary for the ChildProject standard',
            'type': vocabulary_id,
        }
        context.update(vocabulary)
        dsmeta['@context'] = context

        return dsmeta

    def _get_cnmeta(self, dataset, content):
        cnmeta = []

        contents = [
            {'path': f, 'abspath': os.path.abspath(os.path.join(self.ds.path, f))}
            for f in self.paths
        ]

        annotations = self.am.annotations
        annotations['abspath'] = annotations.apply(
            lambda row: os.path.join(
                self.project.path,
                'annotations',
                row['set'],
                'converted',
                row['annotation_filename'],
            ),
            axis=1,
        )
        annotations['abspath'] = annotations['abspath'].apply(os.path.abspath)

        # keep only the latest version of each converted file
        annotations.sort_values('imported_at', inplace=True)
        annotations.drop_duplicates('abspath', keep='last', inplace=True)

        annotations = annotations.merge(
            pd.DataFrame(contents),
            how='inner',
            left_on='abspath',
            right_on='abspath',
        )
        annotations['columns'] = annotations['abspath'].apply(
            lambda f: ','.join(pd.read_csv(f).dropna(axis=1, how='all').columns)
        )

        cnmeta.extend([
            (
                annotation['path'],
                {
                    'set': annotation['set'],
                    'format': annotation['format'],
                    'data': annotation['columns'],
                    'package_version': annotation['package_version'],
                    'duration': annotation['range_offset'] - annotation['range_onset'],
                },
            )
            for annotation in annotations.to_dict(orient='records')
        ])

        return cnmeta

    def get_metadata(self, dataset, content):
        try:
            self._load()
        except Exception as exc:
            lgr.error("could not read the metadata due to some exception.\n{}".format(str(exc)))
            return {}, []

        dsmeta = self._get_dsmeta(dataset, content)
        cnmeta = self._get_cnmeta(dataset, content) if content else []

        return (dsmeta, cnmeta)
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(
    description='import and convert VTC annotations into the project')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--overwrite", help="overwrite the existing set",
                    dest='overwrite', action='store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set('vtc')

input = project.recordings[['filename']]
input.rename(columns={'filename': 'recording_filename'}, inplace=True)
input = input[input['recording_filename'] != 'NA']
input['set'] = 'vtc'
input['time_seek'] = 0
input['range_onset'] = 0
input['range_offset'] = 0
input['raw_filename'] = input['recording_filename'].apply(
    lambda s: os.path.join('vtc', s + '.rttm'))
input['format'] = 'vtc_rttm'

# run the importation
am.import_annotations(input)
#!/usr/bin/env python3
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(description='import and convert VTC annotations into the project')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--set", help="annotation set. the rttm files should lie in <source>/annotations/<set>/raw/", default='vtc')
parser.add_argument("--overwrite", help="overwrite the destination set if it already exists", dest='overwrite', action='store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set(args.set)

input = project.recordings[['recording_filename', 'duration']]
input = input[input['recording_filename'] != 'NA']
input['set'] = args.set
input['time_seek'] = 0
input['range_onset'] = 0
input['range_offset'] = input['duration']
input['raw_filename'] = input['recording_filename'].apply(lambda s: os.path.splitext(s)[0] + '.rttm')
input['format'] = 'vtc_rttm'

am.import_annotations(input, threads=4)
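# Example invocation (assuming the script above is saved as import_vtc.py;
# the dataset path is a placeholder):
#   python import_vtc.py --source /path/to/dataset --set vtc --overwrite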