def test_import(project):
    # NOTE(review): this function is shadowed by a later definition of the
    # same name in this file, so pytest never collects or runs this version.
    # It targets the legacy 'raw_annotations' layout; consider deleting it.
    am = AnnotationManager(project)
    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()
    # one index row expected per input annotation
    assert am.annotations.shape[0] == input_annotations.shape[
        0], "imported annotations length does not match input"
    # every converted annotation file must exist on disk
    assert all([
        os.path.exists(os.path.join(project.path, 'annotations', f))
        for f in am.annotations['annotation_filename'].tolist()
    ]), "some annotations are missing"
    errors, warnings = am.validate()
    assert len(errors) == 0 and len(
        warnings) == 0, "malformed annotations detected"
    # compare converted segments of each set against its stored truth file
    for dataset in ['eaf', 'textgrid', 'eaf_solis']:
        annotations = am.annotations[am.annotations['set'] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)
        # NOTE(review): check_less_precise was deprecated in pandas 1.1 and
        # removed in 2.0 — this call would crash on modern pandas if revived.
        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1).sort_values(
                segments.columns.tolist()).reset_index(drop=True),
            pd.read_csv('tests/truth/{}.csv'.format(dataset)).sort_index(
                axis=1).sort_values(
                    segments.columns.tolist()).reset_index(drop=True),
            check_less_precise=True)
def test_import(project):
    """Import the example annotations; check the index, converted files,
    validation, and the converted segment contents against truth files."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    # one index row expected per input annotation
    assert (
        am.annotations.shape[0] == input_annotations.shape[0]
    ), "imported annotations length does not match input"

    # every converted annotation file must exist on disk
    assert all(
        [
            os.path.exists(
                os.path.join(
                    project.path,
                    "annotations",
                    a["set"],
                    "converted",
                    a["annotation_filename"],
                )
            )
            for a in am.annotations.to_dict(orient="records")
        ]
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    errors, warnings = am.read()
    assert (
        len(errors) == 0 and len(warnings) == 0
    ), "malformed annotation indexes detected"

    for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        # keep only the segment columns (plus raw_filename) for comparison
        segments.drop(
            columns=set(annotations.columns) - {"raw_filename"}, inplace=True
        )
        truth = pd.read_csv("tests/truth/{}.csv".format(dataset))

        # `check_less_precise` was deprecated in pandas 1.1 and removed in
        # 2.0; rtol=1e-3 approximates its former ~3-digit tolerance.
        pd.testing.assert_frame_equal(
            standardize_dataframe(segments, set(truth.columns.tolist())),
            standardize_dataframe(truth, set(truth.columns.tolist())),
            check_exact=False,
            rtol=1e-3,
        )
def test_vc_stats(project, turntakingthresh):
    # Vocalization-count statistics for the example rttm should match the
    # stored truth table for the given turn-taking threshold.
    am = AnnotationManager(project)
    am.import_annotations(
        pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv'))
    raw_rttm = 'example_metrics.rttm'
    segments = am.annotations[am.annotations['raw_filename'] == raw_rttm]
    vc = am.get_vc_stats(am.get_segments(segments),
                         turntakingthresh=turntakingthresh).reset_index()
    # one truth file per threshold value, e.g. vc_truth_1.0.csv
    truth_vc = pd.read_csv(
        'tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh))
    # atol=3 tolerates small rounding differences in the counted values.
    # NOTE(review): sort_values without reset_index(drop=True) leaves the
    # permuted indexes in place; assert_frame_equal compares indexes too, so
    # this only passes while both frames sort into the same index order —
    # confirm if the truth files are regenerated.
    pd.testing.assert_frame_equal(
        vc.reset_index().sort_index(axis=1).sort_values(vc.columns.tolist()),
        truth_vc.reset_index().sort_index(axis=1).sort_values(
            vc.columns.tolist()),
        atol=3)
def test_clipping(project):
    # NOTE(review): shadowed by a later definition of the same name in this
    # file, so pytest never runs this version. It also uses the legacy
    # 'raw_annotations' layout and second-based (not ms) clip bounds.
    am = AnnotationManager(project)
    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()
    start = 1981
    stop = 1984
    segments = am.get_segments(
        am.annotations[am.annotations['set'] == 'vtc_rttm'])
    segments = am.clip_segments(segments, start, stop)
    # clipped segments must fit entirely within [start, stop]
    assert segments['segment_onset'].between(
        start, stop).all() and segments['segment_offset'].between(
            start, stop).all(), "segments not properly clipped"
    assert segments.shape[0] == 2, "got {} segments, expected 2".format(
        segments.shape[0])
def test_clipping(project):
    """Clipping vtc_rttm segments to a window must bound and filter them."""
    manager = AnnotationManager(project)
    index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    # restrict the import to the single recording covered by the window
    index = index[index["recording_filename"] == "sound.wav"]
    manager.import_annotations(index[index["set"] == "vtc_rttm"])
    manager.read()

    window_onset, window_offset = 1981000, 1984000
    vtc_index = manager.annotations[manager.annotations["set"] == "vtc_rttm"]
    clipped = manager.clip_segments(
        manager.get_segments(vtc_index), window_onset, window_offset
    )

    # every clipped segment must lie entirely within the window
    onsets_ok = clipped["segment_onset"].between(window_onset, window_offset).all()
    offsets_ok = clipped["segment_offset"].between(window_onset, window_offset).all()
    assert onsets_ok and offsets_ok, "segments not properly clipped"
    assert clipped.shape[0] == 2, "got {} segments, expected 2".format(
        clipped.shape[0]
    )
def test_merge(project):
    """Merging vtc_rttm with alice must produce a combined alice_vtc set
    whose adult segments carry alice's phoneme/syllable/word counts."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["set"].isin(["vtc_rttm", "alice"])
    ]
    am.import_annotations(input_annotations)
    am.read()

    # first merge: restricted to a single recording
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        recording_filter={"sound.wav"},
    )
    am.read()
    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 1
    assert anns.iloc[0]["recording_filename"] == "sound.wav"

    # sleep so the second merge gets a distinct 'imported_at' timestamp,
    # proving below that both merges were recorded separately
    time.sleep(2)

    # second merge: fill in the remaining recordings, skipping existing ones
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        skip_existing=True,
    )
    am.read()
    anns = am.annotations[am.annotations["set"] == "alice_vtc"]
    assert anns.shape[0] == 2
    assert set(anns["recording_filename"].unique()) == {"sound.wav", "sound2.wav"}
    assert anns.iloc[0]["imported_at"] != anns.iloc[1]["imported_at"]

    segments = am.get_segments(am.annotations[am.annotations["set"] == "alice_vtc"])
    vtc_segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    # merged set keeps every vtc segment and adds the three alice columns
    assert segments.shape[0] == vtc_segments.shape[0]
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    adult_segments = (
        segments[segments["speaker_type"].isin(["FEM", "MAL"])]
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )
    alice = (
        am.get_segments(am.annotations[am.annotations["set"] == "alice"])
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        adult_segments[["phonemes", "syllables", "words"]],
        alice[["phonemes", "syllables", "words"]],
    )
def run(
    self,
    destination: str,
    segments: str,
    eaf_type: str,
    template: str,
    context_onset: int = 0,
    context_offset: int = 0,
    path: str = None,
    import_speech_from: str = None,
    **kwargs,
):
    """generate .eaf templates based on intervals to code.

    :param destination: eaf destination
    :type destination: str
    :param segments: path to the input segments dataframe
    :type segments: str
    :param eaf_type: eaf-type [random, periodic]
    :type eaf_type: str
    :param template: name of the template to use (basic, native, or non-native)
    :type template: str
    :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
    :type context_onset: int
    :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
    :type context_offset: int
    :param path: project path; required together with import_speech_from to prefill speech segments
    :type path: str
    :param import_speech_from: annotation set to import speech segments from (optional)
    :type import_speech_from: str
    """
    # importlib.resources fallback for older Pythons
    try:
        from importlib import resources
    except ImportError:
        # TODO: Perhaps add this as a dependency to the resources?
        import importlib_resources as resources

    etf_path = "{}.etf".format(template)
    pfsx_path = "{}.pfsx".format(template)

    # bundled templates are resolved from the package data; any other
    # template name is treated as a path on disk
    if template in ["basic", "native", "non-native"]:
        with resources.path("ChildProject.templates", etf_path) as etf:
            etf_path = str(etf)

        with resources.path("ChildProject.templates", pfsx_path) as pfsx:
            pfsx_path = str(pfsx)

    if not os.path.exists(etf_path):
        raise Exception("{} cannot be found".format(etf_path))
    if not os.path.exists(pfsx_path):
        raise Exception("{} cannot be found".format(pfsx_path))

    print("making the " + eaf_type + " eaf file and csv")

    segments = pd.read_csv(segments)

    assert_dataframe("segments", segments, not_empty=True)
    assert_columns_presence(
        "segments",
        segments,
        {"recording_filename", "segment_onset", "segment_offset"},
    )

    imported_set = None
    # only prefill speech segments when both a project path and a source
    # annotation set were provided
    prefill = path and import_speech_from
    if prefill:
        project = ChildProject(path)
        am = AnnotationManager(project)
        am.read()
        imported_set = import_speech_from

    # one eaf per recording, each in its own sub-directory of destination
    for recording_filename, segs in segments.groupby("recording_filename"):
        recording_prefix = os.path.splitext(recording_filename)[0]
        output_filename = (recording_prefix + "_" + eaf_type + "_" +
                           os.path.basename(template))

        # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
        timestamps = [(on, off) for on, off in
                      segs.loc[:, ["segment_onset", "segment_offset"]].values]

        speech_segments = None
        imported_format = None
        if prefill:
            # fetch the annotations overlapping this recording's segments
            ranges = segs.assign(
                recording_filename=recording_filename).rename(
                    columns={
                        "segment_onset": "range_onset",
                        "segment_offset": "range_offset",
                    })
            matches = am.get_within_ranges(ranges, [import_speech_from], 'warn')

            if len(matches) == 0:
                continue

            speech_segments = am.get_segments(matches)
            try:
                matches = matches["format"].drop_duplicates()
                # only record a format when it is unambiguous
                if len(matches.index) == 1:
                    imported_format = matches.iloc[0]
            except KeyError:
                imported_format = None

        output_dir = os.path.join(destination, recording_prefix)

        create_eaf(
            etf_path,
            output_filename,
            output_dir,
            recording_filename,
            timestamps,
            eaf_type,
            context_onset,
            context_offset,
            template,
            speech_segments,
            imported_set,
            imported_format,
        )

        # ship the matching pfsx (preferences) file alongside the eaf
        shutil.copy(
            pfsx_path,
            os.path.join(output_dir, "{}.pfsx".format(output_filename)))
def test_periodic(project):
    """Periodic sampling + EAF generation must reproduce the sampled
    intervals and the imported vtc segments in the generated .eaf tiers."""
    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame(
            [{
                "set": "vtc",
                "raw_filename": "file.rttm",
                "time_seek": 0,
                "recording_filename": "sound.wav",
                "range_onset": 0,
                "range_offset": 4000,
                "format": "vtc_rttm",
            }]
        ),
        import_function=partial(fake_vocs, data),
    )

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
    sampler.sample()
    sampler.segments.to_csv('output/eaf/segments.csv')

    ranges = sampler.segments.rename(
        columns={
            "segment_onset": "range_onset",
            "segment_offset": "range_offset",
        }
    )
    annotations = am.get_within_ranges(ranges, [IMP_FROM], 'warn')
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination='output/eaf',
        segments='output/eaf/segments.csv',
        eaf_type='periodic',
        template='basic',
        context_onset=250,
        context_offset=250,
        path='output/eaf',
        import_speech_from='vtc',
    )

    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')

    def tier_intervals(tier_name):
        # collect every annotated (onset, offset) of a tier, resolved
        # through the eaf timeslots, as a DataFrame
        rows = []
        tier = eaf.tiers[tier_name][0]
        for pid in tier:
            (start_ts, end_ts, value, svg_ref) = tier[pid]
            rows.append({
                'segment_onset': int(eaf.timeslots[start_ts]),
                'segment_offset': int(eaf.timeslots[end_ts]),
            })
        return pd.DataFrame(rows)

    def sorted_times(df):
        # canonical ordering so frames can be compared row by row
        return (
            df[['segment_onset', 'segment_offset']]
            .sort_values(['segment_onset', 'segment_offset'])
            .reset_index(drop=True)
        )

    # the coding tier must match the sampler's segments exactly
    pd.testing.assert_frame_equal(
        sorted_times(tier_intervals('code_periodic')),
        sorted_times(sampler.segments),
    )

    # each prefilled VTC tier must match the corresponding imported segments
    expectations = {
        'VTC-SPEECH': annot_segments[pd.isnull(annot_segments['speaker_type'])],
        'VTC-CHI': annot_segments[annot_segments['speaker_type'] == 'CHI'],
        'VTC-OCH': annot_segments[annot_segments['speaker_type'] == 'OCH'],
        'VTC-FEM': annot_segments[annot_segments['speaker_type'] == 'FEM'],
    }
    for tier_name, expected in expectations.items():
        pd.testing.assert_frame_equal(
            sorted_times(tier_intervals(tier_name)),
            sorted_times(expected),
        )

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
def extract_chunks(self,
                   destination,
                   path,
                   annotation_set='vtc',
                   batch_size=1000,
                   target_speaker_type='CHI',
                   sample_size=500,
                   chunk_length=500,
                   threads=0,
                   batches=0,
                   **kwargs):
    """Extract audio chunks of the target speaker and index them in
    '<destination>/chunks.csv' for later upload.

    :param destination: output directory; chunks land in '<destination>/chunks'
    :param path: path to the ChildProject dataset
    :param annotation_set: annotation set to pull segments from
    :param batch_size: number of chunks per batch
    :param target_speaker_type: speaker_type to keep (e.g. 'CHI')
    :param sample_size: number of segments to sample per recording
    :param chunk_length: chunk duration; must divide 1000
    :param threads: worker processes (0 = use all cpus)
    :param batches: unused here; kept for interface compatibility
    :raises ValueError: if the destination chunk folder is not empty
    """
    self.destination = destination
    self.project = ChildProject(path)

    # convert first: these may arrive as strings (e.g. from a CLI), and the
    # divisibility check below needs an int — checking before conversion
    # would raise TypeError instead of a clear assertion message
    batch_size = int(batch_size)
    sample_size = int(sample_size)
    chunk_length = int(chunk_length)
    threads = int(threads)

    assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

    self.sample_size = sample_size
    self.chunk_length = chunk_length

    am = AnnotationManager(self.project)
    self.annotations = am.annotations
    self.annotations = self.annotations[
        self.annotations['set'] == annotation_set]
    self.segments = am.get_segments(self.annotations)
    self.segments = self.segments[
        self.segments['speaker_type'] == target_speaker_type]

    # shift segments from annotation-local time to recording time
    self.segments['segment_onset'] = self.segments[
        'segment_onset'] + self.segments['time_seek']
    self.segments['segment_offset'] = self.segments[
        'segment_offset'] + self.segments['time_seek']

    destination_path = os.path.join(destination, 'chunks')
    os.makedirs(destination_path, exist_ok=True)
    if os.listdir(destination_path):
        raise ValueError(
            "destination '{}' is not empty, please choose another destination."
            .format(destination_path))

    segments = []
    for _recording, _segments in self.segments.groupby(
            'recording_filename'):
        segments.append(_segments.assign(recording_filename=_recording))

    # context manager guarantees worker processes are terminated even if
    # split_recording raises (the original pool was never closed/joined)
    with mp.Pool(threads if threads > 0 else mp.cpu_count()) as pool:
        self.chunks = pool.map(self.split_recording, segments)

    self.chunks = itertools.chain.from_iterable(self.chunks)
    self.chunks = pd.DataFrame([{
        'recording': c.recording,
        'onset': c.onset,
        'offset': c.offset,
        'wav': c.getbasename('wav'),
        'mp3': c.getbasename('mp3'),
        'speaker_type': target_speaker_type,
        'date_extracted':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'uploaded': False,
        'project_slug': '',
        'subject_set': '',
        'zooniverse_id': 0
    } for c in self.chunks])

    # shuffle chunks so that they can't be joined back together
    # based on Zooniverse subject IDs
    self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
    self.chunks['batch'] = self.chunks.index.map(
        lambda x: int(x / batch_size))
    self.chunks.index.name = 'index'
    self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))