def test_import(project):
    """Import raw annotations and verify the index length, the presence of
    every converted file on disk, validation cleanliness, and segment
    contents against the per-dataset truth CSVs."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    # one index row per imported annotation
    assert am.annotations.shape[0] == input_annotations.shape[
        0], "imported annotations length does not match input"

    # every converted annotation file must exist on disk
    assert all([
        os.path.exists(os.path.join(project.path, 'annotations', f))
        for f in am.annotations['annotation_filename'].tolist()
    ]), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(
        warnings) == 0, "malformed annotations detected"

    for dataset in ['eaf', 'textgrid', 'eaf_solis']:
        annotations = am.annotations[am.annotations['set'] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)
        # `check_less_precise` was deprecated in pandas 1.1 and removed in
        # 2.0; `check_exact=False` with `rtol=1e-3` reproduces the old
        # "compare to ~3 decimals" behavior.
        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1).sort_values(
                segments.columns.tolist()).reset_index(drop=True),
            pd.read_csv('tests/truth/{}.csv'.format(dataset)).sort_index(
                axis=1).sort_values(
                    segments.columns.tolist()).reset_index(drop=True),
            check_exact=False,
            rtol=1e-3)
def test_rename(project):
    """Renaming an annotation set must move every entry to the new name
    while leaving the index valid."""
    am = AnnotationManager(project)
    raw_index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(raw_index[raw_index["set"] == "textgrid"])
    am.read()

    original_count = am.annotations[am.annotations["set"] == "textgrid"].shape[0]

    am.rename_set("textgrid", "renamed")
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    # the old set name must be empty, and the new one must hold everything
    remaining = am.annotations[am.annotations["set"] == "textgrid"]
    renamed = am.annotations[am.annotations["set"] == "renamed"]
    assert remaining.shape[0] == 0
    assert renamed.shape[0] == original_count
def test_import(project):
    """Import annotations and verify index length, converted files on disk,
    validation of both annotations and indexes, and segment contents
    against the per-dataset truth CSVs."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    assert (
        am.annotations.shape[0] == input_annotations.shape[0]
    ), "imported annotations length does not match input"

    # each index row must have its converted file on disk
    assert all(
        [
            os.path.exists(
                os.path.join(
                    project.path,
                    "annotations",
                    a["set"],
                    "converted",
                    a["annotation_filename"],
                )
            )
            for a in am.annotations.to_dict(orient="records")
        ]
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    errors, warnings = am.read()
    assert (
        len(errors) == 0 and len(warnings) == 0
    ), "malformed annotation indexes detected"

    for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=set(annotations.columns) - {"raw_filename"}, inplace=True)
        truth = pd.read_csv("tests/truth/{}.csv".format(dataset))

        # debug prints removed; `check_less_precise` was deprecated in
        # pandas 1.1 and removed in 2.0 — `check_exact=False, rtol=1e-3`
        # reproduces the old ~3-decimal tolerance.
        pd.testing.assert_frame_equal(
            standardize_dataframe(segments, set(truth.columns.tolist())),
            standardize_dataframe(truth, set(truth.columns.tolist())),
            check_exact=False,
            rtol=1e-3,
        )
def test_clipping(project):
    """Clipped segments must fall entirely within the requested window,
    and exactly two must remain."""
    am = AnnotationManager(project)
    raw_index = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(raw_index)
    am.read()

    window_start = 1981
    window_stop = 1984

    vtc_annotations = am.annotations[am.annotations['set'] == 'vtc_rttm']
    clipped = am.clip_segments(
        am.get_segments(vtc_annotations), window_start, window_stop)

    onsets_ok = clipped['segment_onset'].between(window_start, window_stop).all()
    offsets_ok = clipped['segment_offset'].between(window_start, window_stop).all()
    assert onsets_ok and offsets_ok, "segments not properly clipped"

    assert clipped.shape[0] == 2, "got {} segments, expected 2".format(
        clipped.shape[0])
def test_clipping(project):
    """Clipping restricted to sound.wav: all segments must lie within the
    window and exactly two must remain."""
    am = AnnotationManager(project)
    index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    index = index[index["recording_filename"] == "sound.wav"]
    am.import_annotations(index[index["set"] == "vtc_rttm"])
    am.read()

    start = 1981000
    stop = 1984000

    segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    segments = am.clip_segments(segments, start, stop)

    within_window = (
        segments["segment_onset"].between(start, stop).all()
        and segments["segment_offset"].between(start, stop).all()
    )
    assert within_window, "segments not properly clipped"

    assert segments.shape[0] == 2, "got {} segments, expected 2".format(
        segments.shape[0]
    )
def test_custom_importation(project):
    """Annotations whose format is 'custom' can be imported through a
    user-supplied conversion function."""
    am = AnnotationManager(project)
    # renamed from `input`, which shadowed the builtin of the same name
    annotation_index = pd.DataFrame(
        [
            {
                "set": "vtc_rttm",
                "range_onset": 0,
                "range_offset": 4000,
                "recording_filename": "sound.wav",
                "time_seek": 0,
                "raw_filename": "example.rttm",
                "format": "custom",
            }
        ]
    )

    am.import_annotations(annotation_index, import_function=custom_function)
    am.read()

    errors, warnings = am.validate()
    assert len(errors) == 0
def test_intersect(project):
    """The intersection of the textgrid and vtc_rttm sets must match the
    truth files for both sides."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/intersect.csv')
    am.import_annotations(input_annotations)
    am.read()

    a, b = am.intersection(am.annotations[am.annotations['set'] == 'textgrid'],
                           am.annotations[am.annotations['set'] == 'vtc_rttm'])

    def _normalize(df, sort_columns):
        # canonical column and row order; drop the import timestamp, which
        # differs between runs (was duplicated inline four times)
        return (df.sort_index(axis=1)
                  .sort_values(sort_columns)
                  .reset_index(drop=True)
                  .drop(columns=['imported_at']))

    pd.testing.assert_frame_equal(
        _normalize(a, a.columns.tolist()),
        _normalize(pd.read_csv('tests/truth/intersect_a.csv'),
                   a.columns.tolist()))

    pd.testing.assert_frame_equal(
        _normalize(b, b.columns.tolist()),
        _normalize(pd.read_csv('tests/truth/intersect_b.csv'),
                   b.columns.tolist()))
def test_merge(project):
    """merge_sets combines vtc_rttm with alice: a merge restricted by
    recording_filter plus a skip_existing merge must compose into the full
    output set, and merged adult segments must carry the alice columns."""
    am = AnnotationManager(project)
    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["set"].isin(["vtc_rttm", "alice"])
    ]
    am.import_annotations(input_annotations)
    am.read()  # duplicated read() and leftover debug prints removed

    # first merge: restricted to sound.wav only
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        recording_filter={'sound.wav'},
    )
    am.read()
    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 1
    assert anns.iloc[0]['recording_filename'] == 'sound.wav'

    # sleep so the second merge gets a distinct 'imported_at' value,
    # proving below that both merges are kept in the index
    time.sleep(2)

    # second merge: skip_existing completes the remaining recordings
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        skip_existing=True,
    )
    am.read()
    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 2
    assert set(anns['recording_filename'].unique()) == {'sound.wav', 'sound2.wav'}
    assert anns.iloc[0]['imported_at'] != anns.iloc[1]['imported_at']

    # merged set keeps every vtc segment and gains the 3 alice columns
    segments = am.get_segments(am.annotations[am.annotations["set"] == "alice_vtc"])
    vtc_segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    assert segments.shape[0] == vtc_segments.shape[0]
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    adult_segments = (
        segments[segments["speaker_type"].isin(["FEM", "MAL"])]
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )
    alice = (
        am.get_segments(am.annotations[am.annotations["set"] == "alice"])
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )

    # adult merged segments must carry the alice lexical columns verbatim
    pd.testing.assert_frame_equal(
        adult_segments[["phonemes", "syllables", "words"]],
        alice[["phonemes", "syllables", "words"]],
    )
def run(
    self,
    destination: str,
    segments: str,
    eaf_type: str,
    template: str,
    context_onset: int = 0,
    context_offset: int = 0,
    path: str = None,
    import_speech_from: str = None,
    **kwargs,
):
    """generate .eaf templates based on intervals to code.

    Writes one .eaf (plus a matching .pfsx) per recording found in the
    input segments file, under ``destination/<recording_prefix>/``.

    :param destination: eaf destination
    :type destination: str
    :param segments: path to the input segments dataframe
    :type segments: str
    :param eaf_type: eaf-type [random, periodic]
    :type eaf_type: str
    :param template: name of the template to use (basic, native, or non-native)
    :type template: str
    :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
    :type context_onset: int
    :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
    :type context_offset: int
    :param path: project path; required together with ``import_speech_from`` to prefill speech segments
    :type path: str
    :param import_speech_from: annotation set to import existing speech segments from
    :type import_speech_from: str
    """
    # importlib.resources is stdlib on 3.7+; fall back to the backport.
    try:
        from importlib import resources
    except ImportError:
        # TODO: Perhaps add this as a dependency to the resources?
        import importlib_resources as resources

    etf_path = "{}.etf".format(template)
    pfsx_path = "{}.pfsx".format(template)

    # built-in template names resolve to files shipped with the package;
    # any other value is treated as a literal path prefix
    if template in ["basic", "native", "non-native"]:
        with resources.path("ChildProject.templates", etf_path) as etf:
            etf_path = str(etf)
        with resources.path("ChildProject.templates", pfsx_path) as pfsx:
            pfsx_path = str(pfsx)

    if not os.path.exists(etf_path):
        raise Exception("{} cannot be found".format(etf_path))
    if not os.path.exists(pfsx_path):
        raise Exception("{} cannot be found".format(pfsx_path))

    print("making the " + eaf_type + " eaf file and csv")

    # `segments` is rebound from a path string to the loaded DataFrame
    segments = pd.read_csv(segments)

    assert_dataframe("segments", segments, not_empty=True)
    assert_columns_presence(
        "segments",
        segments,
        {"recording_filename", "segment_onset", "segment_offset"},
    )

    # prefill mode: load the project so existing speech segments can be
    # imported into the generated eaf files
    imported_set = None
    prefill = path and import_speech_from
    if prefill:
        project = ChildProject(path)
        am = AnnotationManager(project)
        am.read()
        imported_set = import_speech_from

    for recording_filename, segs in segments.groupby("recording_filename"):
        recording_prefix = os.path.splitext(recording_filename)[0]
        output_filename = (recording_prefix + "_" + eaf_type + "_" +
                           os.path.basename(template))

        # TODO: This list of timestamps as tuples might not be ideal/should
        # perhaps be optimized, but I am just replicating the original eaf
        # creation code here.
        timestamps = [(on, off) for on, off in
                      segs.loc[:, ["segment_onset", "segment_offset"]].values]

        speech_segments = None
        imported_format = None
        if prefill:
            # reuse the segment bounds as annotation ranges for the lookup
            ranges = segs.assign(
                recording_filename=recording_filename).rename(
                    columns={
                        "segment_onset": "range_onset",
                        "segment_offset": "range_offset",
                    })
            matches = am.get_within_ranges(ranges, [import_speech_from], 'warn')

            # recordings with no matching annotations are skipped entirely
            if len(matches) == 0:
                continue

            speech_segments = am.get_segments(matches)

            # only propagate the format if it is unambiguous across matches
            try:
                matches = matches["format"].drop_duplicates()
                if len(matches.index) == 1:
                    imported_format = matches.iloc[0]
            except KeyError:
                imported_format = None

        output_dir = os.path.join(destination, recording_prefix)

        create_eaf(
            etf_path,
            output_filename,
            output_dir,
            recording_filename,
            timestamps,
            eaf_type,
            context_onset,
            context_offset,
            template,
            speech_segments,
            imported_set,
            imported_format,
        )

        # ship the matching preferences file alongside the eaf
        shutil.copy(
            pfsx_path,
            os.path.join(output_dir, "{}.pfsx".format(output_filename)))
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task import argparse parser = argparse.ArgumentParser( description= 'compute agreement measures for all given annotators over a whole dataset') parser.add_argument('path', help='path to the dataset') parser.add_argument('--sets', nargs='+', help='sets to include') args = parser.parse_args() speakers = ['CHI', 'OCH', 'FEM', 'MAL'] project = ChildProject(args.path) am = AnnotationManager(project) am.read() intersection = AnnotationManager.intersection(am.annotations, args.sets) segments = am.get_collapsed_segments(intersection) segments = segments[segments['speaker_type'].isin(speakers)] vectors = [ grid_to_vector( segments_to_grid(segments[segments['set'] == s], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers, none=False), speakers) for s in args.sets
class MetadataExtractor(BaseMetadataExtractor):
    """Extracts dataset-level and content-level metadata from a ChildProject
    dataset for the surrounding metadata framework (presumably datalad's
    extractor interface — TODO confirm against BaseMetadataExtractor)."""

    def _load(self):
        # Open the project rooted at the dataset path and read its
        # annotation index; called lazily from get_metadata().
        self.project = ChildProject(self.ds.path)
        self.am = AnnotationManager(self.project)
        self.am.read()

    def _get_dsmeta(self, dataset, content):
        """Build the dataset-level metadata dict (experiment, sample size,
        languages, devices, and the JSON-LD @context vocabulary)."""
        recordings = self.project.recordings
        children = self.project.children

        ## Extract experiment(s)
        # a dataset is expected to belong to exactly one experiment;
        # anything else is logged and leaves experiment as None
        experiment = None
        try:
            experiments = list(recordings['experiment'].unique())
            assert len(experiments) == 1
            experiment = experiments[0]
        except Exception as exc:
            lgr.error("could not determine the experiment ({})".format(str(exc)))

        dsmeta = {
            'experiment': experiment
        }

        ## Extract sample size
        dsmeta['total_children'] = children.shape[0]
        dsmeta['total_recordings'] = recordings.dropna(subset = ['recording_filename']).shape[0]
        dsmeta['total_duration'] = int(recordings['duration'].sum())

        ## Extract languages
        # both a scalar 'language' column and a ';'-separated 'languages'
        # column are supported; duplicates are collapsed with set()
        languages = []
        if 'language' in children.columns:
            languages.extend(children['language'].str.strip().tolist())
        if 'languages' in children.columns:
            languages.extend(np.ravel(children['languages'].str.split(';').map(lambda s: s.strip()).tolist()))
        dsmeta['languages'] = list(set(languages))

        ### Extract devices
        dsmeta['devices'] = list(recordings['recording_device_type'].dropna().unique())

        ### Vocabulary specifications
        # JSON-LD @context describing the ad-hoc ChildProject vocabulary
        context = {}
        context['childproject'] = {
            '@id': '#',
            'description': 'ad-hoc vocabulary for the ChildProject standard',
            'type': vocabulary_id,
        }
        context.update(vocabulary)
        dsmeta['@context'] = context

        return dsmeta

    def _get_cnmeta(self, dataset, content):
        """Build per-file (content) metadata for every converted annotation
        file that appears in self.paths.

        NOTE(review): this mutates self.am.annotations in place (adds an
        'abspath' column, sorts, drops duplicates) — confirm no later
        consumer relies on the pristine index.
        """
        cnmeta = []

        # map each requested path to its absolute location on disk
        contents = [{'path': f, 'abspath': os.path.abspath(os.path.join(self.ds.path, f))} for f in self.paths]

        annotations = self.am.annotations
        # derive the absolute path of each converted annotation file
        annotations['abspath'] = annotations.apply(lambda row: os.path.join(
            self.project.path,
            'annotations',
            row['set'],
            'converted',
            row['annotation_filename']
            ), axis = 1
        )
        annotations['abspath'] = annotations['abspath'].apply(os.path.abspath)

        # when several index rows point at the same file, keep only the
        # most recently imported one
        annotations.sort_values('imported_at', inplace = True)
        annotations.drop_duplicates('abspath', keep = 'last', inplace = True)

        # keep only annotations whose file was actually requested
        annotations = annotations.merge(
            pd.DataFrame(contents),
            how = 'inner',
            left_on = 'abspath',
            right_on = 'abspath'
        )

        # record the non-empty columns of each converted CSV as a
        # comma-separated list (reads every file from disk)
        annotations['columns'] = annotations['abspath'].apply(lambda f:
            ','.join(pd.read_csv(f).dropna(axis=1, how='all').columns)
        )

        cnmeta.extend([
            (
                annotation['path'],
                {
                    'set': annotation['set'],
                    'format': annotation['format'],
                    'data': annotation['columns'],
                    'package_version': annotation['package_version'],
                    'duration': annotation['range_offset']-annotation['range_onset']
                }
            )
            for annotation in annotations.to_dict(orient = 'records')
        ])

        return cnmeta

    def get_metadata(self, dataset, content):
        """Entry point: return (dataset_metadata, content_metadata).

        Failures to read the project are logged and yield empty metadata
        rather than raising, so extraction is best-effort by design.
        """
        try:
            self._load()
        except Exception as exc:
            lgr.error("could not read the metadata due to some exception.\n{}".format(str(exc)))
            return {}, []

        dsmeta = self._get_dsmeta(dataset, content)
        # content metadata is only computed when content extraction was
        # requested by the caller
        cnmeta = self._get_cnmeta(dataset, content) if content else []

        return (dsmeta, cnmeta)