def test_import(project):
    """Import the example annotations and compare them with the truth files.

    Checks that every input row was imported, that each converted file
    exists under the project's ``annotations`` directory, that validation
    reports nothing, and that the segments of the ``eaf``, ``textgrid``
    and ``eaf_solis`` sets match ``tests/truth/<set>.csv``.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    manager.import_annotations(raw_index)
    manager.read()

    imported_count = manager.annotations.shape[0]
    expected_count = raw_index.shape[0]
    assert imported_count == expected_count, \
        "imported annotations length does not match input"

    filenames = manager.annotations['annotation_filename'].tolist()
    found = [
        os.path.exists(os.path.join(project.path, 'annotations', filename))
        for filename in filenames
    ]
    assert all(found), "some annotations are missing"

    errors, warnings = manager.validate()
    assert len(errors) == 0, "malformed annotations detected"
    assert len(warnings) == 0, "malformed annotations detected"

    for set_name in ('eaf', 'textgrid', 'eaf_solis'):
        in_set = manager.annotations['set'] == set_name
        set_annotations = manager.annotations[in_set]

        converted = manager.get_segments(set_annotations)
        # Keep only the segment-level columns before comparing.
        converted.drop(columns=set_annotations.columns, inplace=True)
        sort_keys = converted.columns.tolist()

        actual = converted.sort_index(axis=1)
        actual = actual.sort_values(sort_keys).reset_index(drop=True)

        expected = pd.read_csv('tests/truth/{}.csv'.format(set_name))
        expected = expected.sort_index(axis=1)
        expected = expected.sort_values(sort_keys).reset_index(drop=True)

        pd.testing.assert_frame_equal(actual,
                                      expected,
                                      check_less_precise=True)
예제 #2
0
def test_rename(project):
    """Rename the ``textgrid`` set and check that its annotations follow.

    After renaming to ``renamed``, validation must report nothing, the
    old set name must have no annotations left, and the new name must
    hold as many annotations as the old one did.
    """
    manager = AnnotationManager(project)

    index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    textgrid_rows = index[index["set"] == "textgrid"]
    manager.import_annotations(textgrid_rows)
    manager.read()

    def count_in_set(set_name):
        # Number of indexed annotations currently belonging to `set_name`.
        return manager.annotations[
            manager.annotations["set"] == set_name
        ].shape[0]

    count_before = count_in_set("textgrid")

    manager.rename_set("textgrid", "renamed")
    manager.read()

    errors, warnings = manager.validate()
    assert len(errors) == 0, "malformed annotations detected"
    assert len(warnings) == 0, "malformed annotations detected"

    assert count_in_set("textgrid") == 0
    assert count_in_set("renamed") == count_before
예제 #3
0
def test_import(project):
    """Import the example annotations and compare them with the truth files.

    Checks the imported row count, the presence of every converted file
    under ``annotations/<set>/converted``, that both ``validate()`` and
    ``read()`` report nothing, and that the segments of the
    ``eaf_basic``, ``textgrid`` and ``eaf_solis`` sets match
    ``tests/truth/<set>.csv``.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    manager.import_annotations(raw_index)
    manager.read()

    imported_count = manager.annotations.shape[0]
    assert (
        imported_count == raw_index.shape[0]
    ), "imported annotations length does not match input"

    converted_paths = [
        os.path.join(
            project.path,
            "annotations",
            record["set"],
            "converted",
            record["annotation_filename"],
        )
        for record in manager.annotations.to_dict(orient="records")
    ]
    assert all(
        [os.path.exists(converted) for converted in converted_paths]
    ), "some annotations are missing"

    errors, warnings = manager.validate()
    assert len(errors) == 0, "malformed annotations detected"
    assert len(warnings) == 0, "malformed annotations detected"

    errors, warnings = manager.read()
    assert len(errors) == 0, "malformed annotation indexes detected"
    assert len(warnings) == 0, "malformed annotation indexes detected"

    for set_name in ("eaf_basic", "textgrid", "eaf_solis"):
        in_set = manager.annotations["set"] == set_name
        set_annotations = manager.annotations[in_set]

        converted = manager.get_segments(set_annotations)
        # Drop the index-level columns, except raw_filename which is kept.
        index_columns = set(set_annotations.columns) - {"raw_filename"}
        converted.drop(columns=index_columns, inplace=True)

        expected = pd.read_csv("tests/truth/{}.csv".format(set_name))

        print(converted)
        print(expected)

        pd.testing.assert_frame_equal(
            standardize_dataframe(converted, set(expected.columns.tolist())),
            standardize_dataframe(expected, set(expected.columns.tolist())),
            check_less_precise=True,
        )
def test_clipping(project):
    """Clip the ``vtc_rttm`` segments to [1981, 1984] and check the result.

    Every onset and offset of the clipped segments must lie within the
    window, and exactly two segments are expected.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    manager.import_annotations(raw_index)
    manager.read()

    start, stop = 1981, 1984

    vtc_annotations = manager.annotations[
        manager.annotations['set'] == 'vtc_rttm']
    clipped = manager.clip_segments(
        manager.get_segments(vtc_annotations), start, stop)

    assert (
        clipped['segment_onset'].between(start, stop).all()
        and clipped['segment_offset'].between(start, stop).all()
    ), "segments not properly clipped"

    segment_count = clipped.shape[0]
    assert segment_count == 2, "got {} segments, expected 2".format(
        segment_count)
예제 #5
0
def test_clipping(project):
    """Clip the ``vtc_rttm`` segments of ``sound.wav`` and check the result.

    The window is [1981000, 1984000]; every onset and offset of the
    clipped segments must lie within it, and exactly two segments are
    expected.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    raw_index = raw_index[raw_index["recording_filename"] == "sound.wav"]
    vtc_rows = raw_index[raw_index["set"] == "vtc_rttm"]
    manager.import_annotations(vtc_rows)
    manager.read()

    start, stop = 1981000, 1984000

    vtc_annotations = manager.annotations[
        manager.annotations["set"] == "vtc_rttm"
    ]
    clipped = manager.get_segments(vtc_annotations)
    clipped = manager.clip_segments(clipped, start, stop)

    assert (
        clipped["segment_onset"].between(start, stop).all()
        and clipped["segment_offset"].between(start, stop).all()
    ), "segments not properly clipped"

    segment_count = clipped.shape[0]
    assert segment_count == 2, "got {} segments, expected 2".format(
        segment_count
    )
예제 #6
0
def test_custom_importation(project):
    """Import one ``custom``-format annotation through ``custom_function``.

    A single-row index pointing at ``example.rttm`` is imported with a
    user-supplied import function; validation must report no errors.
    """
    manager = AnnotationManager(project)

    custom_row = {
        "set": "vtc_rttm",
        "range_onset": 0,
        "range_offset": 4000,
        "recording_filename": "sound.wav",
        "time_seek": 0,
        "raw_filename": "example.rttm",
        "format": "custom",
    }
    custom_index = pd.DataFrame([custom_row])

    manager.import_annotations(custom_index, import_function=custom_function)
    manager.read()

    errors, _warnings = manager.validate()
    assert len(errors) == 0
def test_intersect(project):
    """Intersect the ``textgrid`` and ``vtc_rttm`` sets and check both outputs.

    Each side of the intersection is compared with its truth file
    (``tests/truth/intersect_a.csv`` / ``intersect_b.csv``) after sorting
    columns and rows and dropping ``imported_at``.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/intersect.csv')
    manager.import_annotations(raw_index)
    manager.read()

    textgrid = manager.annotations[manager.annotations['set'] == 'textgrid']
    vtc = manager.annotations[manager.annotations['set'] == 'vtc_rttm']
    a, b = manager.intersection(textgrid, vtc)

    def canonical(frame, sort_keys):
        # Same column order, same row order, fresh index, no timestamp.
        ordered = frame.sort_index(axis=1).sort_values(sort_keys)
        return ordered.reset_index(drop=True).drop(columns=['imported_at'])

    comparisons = (
        (a, 'tests/truth/intersect_a.csv'),
        (b, 'tests/truth/intersect_b.csv'),
    )
    for result, truth_path in comparisons:
        sort_keys = result.columns.tolist()
        pd.testing.assert_frame_equal(
            canonical(result, sort_keys),
            canonical(pd.read_csv(truth_path), sort_keys))
예제 #8
0
def test_merge(project):
    """Merge ``vtc_rttm`` and ``alice`` into ``alice_vtc`` in two passes.

    The first merge is restricted to ``sound.wav``; the second one, run
    with ``skip_existing``, is expected to add ``sound2.wav`` only. The
    merged segments are then compared with the source sets.
    """
    manager = AnnotationManager(project)

    raw_index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    raw_index = raw_index[raw_index["set"].isin(["vtc_rttm", "alice"])]
    print(raw_index)
    manager.import_annotations(raw_index)
    manager.read()

    print(manager.annotations)
    manager.read()

    def merge_into_alice_vtc(**extra):
        # Both merges share these options; `extra` carries what differs.
        manager.merge_sets(
            left_set="vtc_rttm",
            right_set="alice",
            left_columns=["speaker_type"],
            right_columns=["phonemes", "syllables", "words"],
            output_set="alice_vtc",
            full_set_merge=False,
            **extra
        )
        manager.read()

    def annotations_of(set_name):
        return manager.annotations[manager.annotations["set"] == set_name]

    merge_into_alice_vtc(recording_filter={'sound.wav'})

    merged_index = annotations_of('alice_vtc')
    assert merged_index.shape[0] == 1
    assert merged_index.iloc[0]['recording_filename'] == 'sound.wav'

    # Wait so that the second merge gets a different 'imported_at' value,
    # which is how the two merges are told apart below.
    time.sleep(2)

    merge_into_alice_vtc(skip_existing=True)

    merged_index = annotations_of('alice_vtc')
    assert merged_index.shape[0] == 2
    assert set(merged_index['recording_filename'].unique()) == {'sound.wav','sound2.wav'}
    first_import = merged_index.iloc[0]['imported_at']
    second_import = merged_index.iloc[1]['imported_at']
    assert first_import != second_import

    merged_segments = manager.get_segments(annotations_of("alice_vtc"))
    vtc_segments = manager.get_segments(annotations_of("vtc_rttm"))
    assert merged_segments.shape[0] == vtc_segments.shape[0]
    assert merged_segments.shape[1] == vtc_segments.shape[1] + 3

    time_order = ["segment_onset", "segment_offset"]
    alice_columns = ["phonemes", "syllables", "words"]

    is_adult = merged_segments["speaker_type"].isin(["FEM", "MAL"])
    adult_segments = merged_segments[is_adult].sort_values(time_order)
    adult_segments = adult_segments.reset_index(drop=True)

    alice_segments = manager.get_segments(annotations_of("alice"))
    alice_segments = alice_segments.sort_values(time_order)
    alice_segments = alice_segments.reset_index(drop=True)

    pd.testing.assert_frame_equal(
        adult_segments[alice_columns],
        alice_segments[alice_columns],
    )
예제 #9
0
    def run(
        self,
        destination: str,
        segments: str,
        eaf_type: str,
        template: str,
        context_onset: int = 0,
        context_offset: int = 0,
        path: str = None,
        import_speech_from: str = None,
        **kwargs,
    ):
        """Generate one .eaf file (and its .pfsx) per recording to annotate.

        :param destination: directory receiving one sub-directory per recording
        :type destination: str
        :param segments: path to the CSV of intervals to code; it must have
            the columns recording_filename, segment_onset and segment_offset
        :type segments: str
        :param eaf_type: eaf-type [random, periodic]
        :type eaf_type: str
        :param template: name of a bundled template (basic, native, or
            non-native), or the prefix of custom ``.etf``/``.pfsx`` files
        :type template: str
        :param context_onset: introductory context in milliseconds, 0 for none
        :type context_onset: int
        :param context_offset: outro context in milliseconds, 0 for none
        :type context_offset: int
        :param path: project path; only used together with import_speech_from
        :type path: str
        :param import_speech_from: annotation set whose segments are handed
            to ``create_eaf``; only used when ``path`` is also given
        :type import_speech_from: str
        :raises Exception: if the .etf or .pfsx template file cannot be found
        """

        try:
            from importlib import resources
        except ImportError:
            # TODO: Perhaps add this as a dependency to the resources?
            import importlib_resources as resources

        etf_path = "{}.etf".format(template)
        pfsx_path = "{}.pfsx".format(template)

        uses_bundled_template = template in ("basic", "native", "non-native")
        if uses_bundled_template:
            # Resolve the packaged template files to filesystem paths.
            with resources.path("ChildProject.templates", etf_path) as etf:
                etf_path = str(etf)

            with resources.path("ChildProject.templates", pfsx_path) as pfsx:
                pfsx_path = str(pfsx)

        for required_file in (etf_path, pfsx_path):
            if not os.path.exists(required_file):
                raise Exception("{} cannot be found".format(required_file))

        print("making the " + eaf_type + " eaf file and csv")

        intervals = pd.read_csv(segments)

        assert_dataframe("segments", intervals, not_empty=True)
        assert_columns_presence(
            "segments",
            intervals,
            {"recording_filename", "segment_onset", "segment_offset"},
        )

        # Speech is imported only when both the project and the set are given.
        prefill = path and import_speech_from
        manager = None
        imported_set = None
        if prefill:
            manager = AnnotationManager(ChildProject(path))
            manager.read()
            imported_set = import_speech_from

        for recording_filename, recording_intervals in intervals.groupby(
                "recording_filename"):
            recording_prefix = os.path.splitext(recording_filename)[0]
            output_filename = "_".join(
                [recording_prefix, eaf_type, os.path.basename(template)])

            # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
            bounds = recording_intervals.loc[
                :, ["segment_onset", "segment_offset"]].values
            timestamps = [tuple(pair) for pair in bounds]

            speech_segments = None
            imported_format = None
            if prefill:
                ranges = recording_intervals.assign(
                    recording_filename=recording_filename)
                ranges = ranges.rename(columns={
                    "segment_onset": "range_onset",
                    "segment_offset": "range_offset",
                })
                matches = manager.get_within_ranges(
                    ranges, [import_speech_from], 'warn')

                # No annotation covers these ranges: skip this recording.
                if len(matches) == 0:
                    continue

                speech_segments = manager.get_segments(matches)
                try:
                    formats = matches["format"].drop_duplicates()
                    # A format is reported only when it is unambiguous.
                    if len(formats.index) == 1:
                        imported_format = formats.iloc[0]
                except KeyError:
                    imported_format = None

            output_dir = os.path.join(destination, recording_prefix)

            create_eaf(
                etf_path,
                output_filename,
                output_dir,
                recording_filename,
                timestamps,
                eaf_type,
                context_onset,
                context_offset,
                template,
                speech_segments,
                imported_set,
                imported_format,
            )

            pfsx_destination = os.path.join(
                output_dir, "{}.pfsx".format(output_filename))
            shutil.copy(pfsx_path, pfsx_destination)
예제 #10
0
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task

import argparse

# Command-line interface: the dataset path, plus the annotation sets
# whose annotators are to be compared.
parser = argparse.ArgumentParser(
    description=
    'compute agreement measures for all given annotators over a whole dataset')
parser.add_argument('path', help='path to the dataset')
parser.add_argument('--sets', nargs='+', help='sets to include')
args = parser.parse_args()

# Speaker labels retained for the comparison; segments carrying any other
# 'speaker_type' value are filtered out below.
speakers = ['CHI', 'OCH', 'FEM', 'MAL']

project = ChildProject(args.path)
am = AnnotationManager(project)
am.read()

# Restrict the annotations to the requested sets via
# AnnotationManager.intersection, then fetch the matching segments.
# NOTE(review): presumably only the portions covered by every requested set
# are kept -- confirm against AnnotationManager.intersection.
intersection = AnnotationManager.intersection(am.annotations, args.sets)
segments = am.get_collapsed_segments(intersection)

# Keep only the segments attributed to one of the speakers of interest.
segments = segments[segments['speaker_type'].isin(speakers)]

vectors = [
    grid_to_vector(
        segments_to_grid(segments[segments['set'] == s],
                         0,
                         segments['segment_offset'].max(),
                         100,
                         'speaker_type',
                         speakers,
                         none=False), speakers) for s in args.sets
예제 #11
0
class MetadataExtractor(BaseMetadataExtractor):
    """Extract dataset- and content-level metadata from a ChildProject dataset.

    Dataset-level metadata summarises the recordings and children tables
    (experiment, sample sizes, total duration, languages, devices).
    Content-level metadata describes each converted annotation file that
    is listed in ``self.paths``.
    """

    def _load(self):
        """Open the project located at ``self.ds.path`` and read its annotations."""
        self.project = ChildProject(self.ds.path)
        self.am = AnnotationManager(self.project)
        self.am.read()

    def _get_dsmeta(self, dataset, content):
        """Build the dataset-level metadata dictionary.

        :param dataset: unused here; kept for signature compatibility
        :param content: unused here; kept for signature compatibility
        :return: dict with the keys ``experiment``, ``total_children``,
            ``total_recordings``, ``total_duration``, ``languages``,
            ``devices`` and ``@context``
        :rtype: dict
        """
        recordings = self.project.recordings
        children = self.project.children

        ## Extract experiment(s)
        experiment = None
        try:
            experiments = list(recordings['experiment'].unique())
            # Explicit check rather than `assert`, which is stripped under -O.
            if len(experiments) != 1:
                raise ValueError(
                    "expected exactly one experiment, found {}".format(len(experiments))
                )
            experiment = experiments[0]
        except Exception as exc:
            # Best effort: the experiment is left as None when undetermined.
            lgr.error("could not determine the experiment ({})".format(str(exc)))

        dsmeta = {
            'experiment': experiment
        }

        ## Extract sample size
        dsmeta['total_children'] = children.shape[0]
        dsmeta['total_recordings'] = recordings.dropna(subset = ['recording_filename']).shape[0]
        dsmeta['total_duration'] = int(recordings['duration'].sum())

        ## Extract languages
        languages = []
        if 'language' in children.columns:
            # Missing values are skipped instead of being reported as NaN.
            languages.extend(
                children['language'].dropna().astype(str).str.strip().tolist()
            )

        if 'languages' in children.columns:
            # Each cell holds a ';'-separated list; strip every item.
            # (Calling .strip() on the split result itself would fail,
            # since str.split yields lists.)
            for cell in children['languages'].dropna().astype(str):
                languages.extend(
                    item.strip() for item in cell.split(';') if item.strip()
                )

        # Sorted so that the output does not depend on set iteration order.
        dsmeta['languages'] = sorted(set(languages))

        ### Extract devices
        dsmeta['devices'] = list(recordings['recording_device_type'].dropna().unique())

        ### Vocabulary specifications
        context = {}
        context['childproject'] = {
            '@id': '#',
            'description': 'ad-hoc vocabulary for the ChildProject standard',
            'type': vocabulary_id,
        }
        context.update(vocabulary)
        dsmeta['@context'] = context

        return dsmeta

    def _get_cnmeta(self, dataset, content):
        """Build the content-level metadata of the converted annotation files.

        Only the annotations whose converted file is among ``self.paths``
        are described; when several index entries point to the same file,
        the most recently imported one is kept.

        :param dataset: unused here; kept for signature compatibility
        :param content: unused here; kept for signature compatibility
        :return: list of ``(path, metadata)`` tuples
        :rtype: list
        """
        cnmeta = []
        contents = [{'path': f, 'abspath': os.path.abspath(os.path.join(self.ds.path, f))} for f in self.paths]

        # Work on a copy: the columns added and the rows dropped below must
        # not leak into the annotation manager's own index.
        annotations = self.am.annotations.copy()

        # Nothing to describe; this also avoids apply()/merge() on empty frames.
        if annotations.shape[0] == 0 or not contents:
            return cnmeta

        annotations['abspath'] = annotations.apply(lambda row:
            os.path.abspath(
                os.path.join(
                    self.project.path,
                    'annotations',
                    row['set'],
                    'converted',
                    row['annotation_filename']
                )
            ),
            axis = 1
        )
        annotations = annotations.sort_values('imported_at').drop_duplicates(
            'abspath',
            keep = 'last'
        )
        annotations = annotations.merge(
            pd.DataFrame(contents, columns = ['path', 'abspath']),
            how = 'inner',
            left_on = 'abspath',
            right_on = 'abspath'
        )
        # Report the columns that actually hold data in each converted file.
        annotations['columns'] = annotations['abspath'].apply(lambda f:
            ','.join(pd.read_csv(f).dropna(axis=1, how='all').columns)
        )

        cnmeta.extend([
            (
                annotation['path'],
                {
                    'set': annotation['set'],
                    'format': annotation['format'],
                    'data': annotation['columns'],
                    'package_version': annotation['package_version'],
                    'duration': annotation['range_offset']-annotation['range_onset']
                }
            )
            for annotation in annotations.to_dict(orient = 'records')
        ])

        return cnmeta

    def get_metadata(self, dataset, content):
        """Return the ``(dataset metadata, content metadata)`` pair.

        If the project cannot be loaded, the error is logged and
        ``({}, [])`` is returned. Content metadata is only computed when
        ``content`` is truthy.

        :param dataset: forwarded to the dataset- and content-level helpers
        :param content: whether content-level metadata is requested
        :return: tuple ``(dsmeta, cnmeta)``
        :rtype: tuple
        """
        try:
            self._load()
        except Exception as exc:
            lgr.error("could not read the metadata due to some exception.\n{}".format(str(exc)))
            return {}, []

        dsmeta = self._get_dsmeta(dataset, content)
        cnmeta = self._get_cnmeta(dataset, content) if content else []

        return (dsmeta, cnmeta)