def test_import(project):
    """Import raw annotations and verify the index length, the presence of
    converted files on disk, validation, and per-set segments against truth CSVs."""
    am = AnnotationManager(project)

    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    assert am.annotations.shape[0] == input_annotations.shape[
        0], "imported annotations length does not match input"

    # every indexed annotation must have its converted file on disk
    assert all(
        os.path.exists(os.path.join(project.path, 'annotations', f))
        for f in am.annotations['annotation_filename'].tolist()
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(
        warnings) == 0, "malformed annotations detected"

    for dataset in ['eaf', 'textgrid', 'eaf_solis']:
        annotations = am.annotations[am.annotations['set'] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)

        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1).sort_values(
                segments.columns.tolist()).reset_index(drop=True),
            pd.read_csv('tests/truth/{}.csv'.format(dataset)).sort_index(
                axis=1).sort_values(
                    segments.columns.tolist()).reset_index(drop=True),
            # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
            # rtol=1e-3 reproduces the former ~3-digit comparison.
            check_exact=False, rtol=1e-3)
# --- Example #2 ---
def test_import(project):
    """Import raw annotations and verify index length, converted files on disk,
    validation and index re-read, and per-set segments against truth CSVs."""
    am = AnnotationManager(project)

    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    assert (
        am.annotations.shape[0] == input_annotations.shape[0]
    ), "imported annotations length does not match input"

    # every indexed annotation must have its converted file on disk
    assert all(
        os.path.exists(
            os.path.join(
                project.path,
                "annotations",
                a["set"],
                "converted",
                a["annotation_filename"],
            )
        )
        for a in am.annotations.to_dict(orient="records")
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    # re-reading the index must also be clean
    errors, warnings = am.read()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected"

    for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=set(annotations.columns) - {"raw_filename"}, inplace=True)
        truth = pd.read_csv("tests/truth/{}.csv".format(dataset))

        pd.testing.assert_frame_equal(
            standardize_dataframe(segments, set(truth.columns.tolist())),
            standardize_dataframe(truth, set(truth.columns.tolist())),
            # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
            # rtol=1e-3 reproduces the former ~3-digit comparison.
            check_exact=False,
            rtol=1e-3,
        )
def test_vc_stats(project, turntakingthresh):
    """Vocal-contour statistics must match the stored truth for the given
    turn-taking threshold."""
    am = AnnotationManager(project)
    am.import_annotations(
        pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv'))

    rttm_annotations = am.annotations[
        am.annotations['raw_filename'] == 'example_metrics.rttm']

    vc = am.get_vc_stats(
        am.get_segments(rttm_annotations),
        turntakingthresh=turntakingthresh).reset_index()
    truth_vc = pd.read_csv(
        'tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh))

    def normalized(frame):
        # enforce deterministic row and column order before comparing
        return frame.reset_index().sort_index(axis=1).sort_values(
            vc.columns.tolist())

    pd.testing.assert_frame_equal(normalized(vc), normalized(truth_vc), atol=3)
def test_clipping(project):
    """Segments clipped to a window must lie inside it and keep the expected count."""
    am = AnnotationManager(project)

    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    start, stop = 1981, 1984
    vtc_annotations = am.annotations[am.annotations['set'] == 'vtc_rttm']
    clipped = am.clip_segments(am.get_segments(vtc_annotations), start, stop)

    onsets_ok = clipped['segment_onset'].between(start, stop).all()
    offsets_ok = clipped['segment_offset'].between(start, stop).all()
    assert onsets_ok and offsets_ok, "segments not properly clipped"
    assert clipped.shape[0] == 2, "got {} segments, expected 2".format(
        clipped.shape[0])
# --- Example #5 ---
def test_clipping(project):
    """Clipping the vtc_rttm segments of sound.wav to a window must keep every
    segment inside the window and yield exactly two segments."""
    am = AnnotationManager(project)

    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[input_annotations["recording_filename"] == "sound.wav"]
    am.import_annotations(input_annotations[input_annotations["set"] == "vtc_rttm"])
    am.read()

    start, stop = 1981000, 1984000
    vtc_annotations = am.annotations[am.annotations["set"] == "vtc_rttm"]
    clipped = am.clip_segments(am.get_segments(vtc_annotations), start, stop)

    onsets_ok = clipped["segment_onset"].between(start, stop).all()
    offsets_ok = clipped["segment_offset"].between(start, stop).all()
    assert onsets_ok and offsets_ok, "segments not properly clipped"
    assert clipped.shape[0] == 2, "got {} segments, expected 2".format(
        clipped.shape[0]
    )
# --- Example #6 ---
def test_merge(project):
    """Merging vtc_rttm with alice must create alice_vtc annotations per
    recording, preserving the vtc segments and adding the alice columns."""
    am = AnnotationManager(project)

    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    input_annotations = input_annotations[
        input_annotations["set"].isin(["vtc_rttm", "alice"])
    ]
    am.import_annotations(input_annotations)
    am.read()

    # first merge: restricted to a single recording
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        recording_filter={'sound.wav'},
    )
    am.read()

    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 1
    assert anns.iloc[0]['recording_filename'] == 'sound.wav'

    # sleep so the second merge gets a distinct 'imported_at' value,
    # proving below that both merges ran independently
    time.sleep(2)

    # second merge: skip_existing must only add the missing recording
    am.merge_sets(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
        skip_existing=True,
    )
    am.read()

    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 2
    assert set(anns['recording_filename'].unique()) == {'sound.wav', 'sound2.wav'}
    assert anns.iloc[0]['imported_at'] != anns.iloc[1]['imported_at']

    # merged set keeps every vtc segment, plus the three alice columns
    segments = am.get_segments(am.annotations[am.annotations["set"] == "alice_vtc"])
    vtc_segments = am.get_segments(am.annotations[am.annotations["set"] == "vtc_rttm"])
    assert segments.shape[0] == vtc_segments.shape[0]
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    adult_segments = (
        segments[segments["speaker_type"].isin(["FEM", "MAL"])]
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )
    alice = (
        am.get_segments(am.annotations[am.annotations["set"] == "alice"])
        .sort_values(["segment_onset", "segment_offset"])
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        adult_segments[["phonemes", "syllables", "words"]],
        alice[["phonemes", "syllables", "words"]],
    )
# --- Example #7 ---
    def run(
        self,
        destination: str,
        segments: str,
        eaf_type: str,
        template: str,
        context_onset: int = 0,
        context_offset: int = 0,
        path: str = None,
        import_speech_from: str = None,
        **kwargs,
    ):
        """generate .eaf templates based on intervals to code.

        One .eaf (plus a copy of the template's .pfsx) is written per
        recording found in the segments dataframe, under
        ``<destination>/<recording_prefix>/``.

        :param destination: eaf destination
        :type destination: str
        :param segments: path to the input segments dataframe (CSV with at
            least recording_filename, segment_onset, segment_offset)
        :type segments: str
        :param eaf_type: eaf-type [random, periodic]
        :type eaf_type: str
        :param template: name of the template to use (basic, native, or
            non-native), or a path prefix to custom .etf/.pfsx files
        :type template: str
        :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
        :type context_onset: int
        :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
        :type context_offset: int
        :param path: project path; required (together with
            import_speech_from) to prefill the eafs with existing annotations
        :type path: str
        :param import_speech_from: annotation set to import speech segments
            from when prefilling
        :type import_speech_from: str
        :raises Exception: if the resolved .etf or .pfsx file does not exist
        """

        try:
            from importlib import resources
        except ImportError:
            # TODO: Perhaps add this as a dependency to the resources?
            import importlib_resources as resources

        etf_path = "{}.etf".format(template)
        pfsx_path = "{}.pfsx".format(template)

        # built-in templates are resolved from the package resources;
        # any other template name is treated as a literal path prefix
        if template in ["basic", "native", "non-native"]:
            with resources.path("ChildProject.templates", etf_path) as etf:
                etf_path = str(etf)

            with resources.path("ChildProject.templates", pfsx_path) as pfsx:
                pfsx_path = str(pfsx)

        if not os.path.exists(etf_path):
            raise Exception("{} cannot be found".format(etf_path))

        if not os.path.exists(pfsx_path):
            raise Exception("{} cannot be found".format(pfsx_path))

        print("making the " + eaf_type + " eaf file and csv")

        segments = pd.read_csv(segments)

        assert_dataframe("segments", segments, not_empty=True)
        assert_columns_presence(
            "segments",
            segments,
            {"recording_filename", "segment_onset", "segment_offset"},
        )

        # prefill only when both a project path and a source set are given
        imported_set = None
        prefill = path and import_speech_from
        if prefill:
            project = ChildProject(path)
            am = AnnotationManager(project)
            am.read()
            imported_set = import_speech_from

        for recording_filename, segs in segments.groupby("recording_filename"):
            recording_prefix = os.path.splitext(recording_filename)[0]
            output_filename = (recording_prefix + "_" + eaf_type + "_" +
                               os.path.basename(template))

            # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
            timestamps = [(on, off) for on, off in
                          segs.loc[:,
                                   ["segment_onset", "segment_offset"]].values]

            speech_segments = None
            imported_format = None
            if prefill:
                # look up existing annotations overlapping the coded ranges
                ranges = segs.assign(
                    recording_filename=recording_filename).rename(
                        columns={
                            "segment_onset": "range_onset",
                            "segment_offset": "range_offset",
                        })
                matches = am.get_within_ranges(ranges, [import_speech_from],
                                               'warn')

                if len(matches) == 0:
                    continue

                speech_segments = am.get_segments(matches)
                # record the source format only if it is unambiguous
                try:
                    matches = matches["format"].drop_duplicates()
                    if len(matches.index) == 1:
                        imported_format = matches.iloc[0]
                except KeyError:
                    imported_format = None

            output_dir = os.path.join(destination, recording_prefix)

            create_eaf(
                etf_path,
                output_filename,
                output_dir,
                recording_filename,
                timestamps,
                eaf_type,
                context_onset,
                context_offset,
                template,
                speech_segments,
                imported_set,
                imported_format,
            )

            # ship the template's preferences file alongside the eaf
            shutil.copy(
                pfsx_path,
                os.path.join(output_dir, "{}.pfsx".format(output_filename)))
# --- Example #8 ---
def test_periodic(project):
    """End-to-end check of the EAF builder with a periodic sampler: the
    generated .eaf must contain the sampled coding intervals and the imported
    VTC speech segments, split into the expected per-speaker tiers."""

    def tier_segments(eaf, tier_name):
        # collect the (onset, offset) pairs of a tier as a DataFrame
        tier = eaf.tiers[tier_name][0]
        rows = []
        for pid in tier:
            start_ts, end_ts, _value, _svg_ref = tier[pid]
            rows.append({
                'segment_onset': int(eaf.timeslots[start_ts]),
                'segment_offset': int(eaf.timeslots[end_ts]),
            })
        return pd.DataFrame(rows)

    def assert_segments_equal(left, right):
        # compare onset/offset pairs regardless of original row order
        cols = ['segment_onset', 'segment_offset']
        pd.testing.assert_frame_equal(
            left[cols].sort_values(cols).reset_index(drop=True),
            right[cols].sort_values(cols).reset_index(drop=True))

    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([{
            "set": "vtc",
            "raw_filename": "file.rttm",
            "time_seek": 0,
            "recording_filename": "sound.wav",
            "range_onset": 0,
            "range_offset": 4000,
            "format": "vtc_rttm",
        }]),
        import_function=partial(fake_vocs, data),
    )

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
    sampler.sample()
    sampler.segments.to_csv('output/eaf/segments.csv')

    ranges = sampler.segments.rename(columns={
        "segment_onset": "range_onset",
        "segment_offset": "range_offset",
    })
    annotations = am.get_within_ranges(ranges, [IMP_FROM], 'warn')
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination='output/eaf',
        segments='output/eaf/segments.csv',
        eaf_type='periodic',
        template='basic',
        context_onset=250,
        context_offset=250,
        path='output/eaf',
        import_speech_from='vtc',
    )

    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')

    # the coding tier must reproduce the sampler's intervals
    assert_segments_equal(tier_segments(eaf, 'code_periodic'), sampler.segments)

    # segments without a speaker_type go to the VTC-SPEECH tier
    assert_segments_equal(
        tier_segments(eaf, 'VTC-SPEECH'),
        annot_segments[pd.isnull(annot_segments['speaker_type'])])

    # each speaker type gets its own tier
    for speaker in ('CHI', 'OCH', 'FEM'):
        assert_segments_equal(
            tier_segments(eaf, 'VTC-{}'.format(speaker)),
            annot_segments[annot_segments['speaker_type'] == speaker])

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
# --- Example #9 ---
    def extract_chunks(self,
                       destination,
                       path,
                       annotation_set='vtc',
                       batch_size=1000,
                       target_speaker_type='CHI',
                       sample_size=500,
                       chunk_length=500,
                       threads=0,
                       batches=0,
                       **kwargs):
        """Cut target-speaker segments into fixed-length chunks and write a
        shuffled chunks.csv index under ``<destination>``.

        :param destination: output directory; chunks are written to
            <destination>/chunks, which must be empty
        :param path: ChildProject dataset path
        :param annotation_set: annotation set to draw segments from
        :param batch_size: number of chunks per batch in the output index
        :param target_speaker_type: speaker_type to keep (e.g. 'CHI')
        :param sample_size: number of segments to sample per recording
        :param chunk_length: chunk duration in ms; must divide 1000
        :param threads: worker processes (0 = all available cores)
        :param batches: unused; kept for interface compatibility
        :raises ValueError: if the chunks destination is not empty
        """
        assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

        self.destination = destination
        self.project = ChildProject(path)

        batch_size = int(batch_size)
        sample_size = int(sample_size)
        chunk_length = int(chunk_length)
        threads = int(threads)

        self.sample_size = sample_size
        self.chunk_length = chunk_length

        am = AnnotationManager(self.project)
        self.annotations = am.annotations
        self.annotations = self.annotations[self.annotations['set'] ==
                                            annotation_set]
        self.segments = am.get_segments(self.annotations)
        self.segments = self.segments[self.segments['speaker_type'] ==
                                      target_speaker_type]
        # shift segments from annotation-relative to recording-absolute time
        self.segments['segment_onset'] = self.segments[
            'segment_onset'] + self.segments['time_seek']
        self.segments['segment_offset'] = self.segments[
            'segment_offset'] + self.segments['time_seek']

        destination_path = os.path.join(destination, 'chunks')
        os.makedirs(destination_path, exist_ok=True)
        if os.listdir(destination_path):
            raise ValueError(
                "destination '{}' is not empty, please choose another destination."
                .format(destination_path))

        segments = []
        for _recording, _segments in self.segments.groupby(
                'recording_filename'):
            segments.append(_segments.assign(recording_filename=_recording))

        # context manager ensures the worker processes are terminated
        # (the original code leaked the pool)
        with mp.Pool(threads if threads > 0 else mp.cpu_count()) as pool:
            self.chunks = pool.map(self.split_recording, segments)

        self.chunks = itertools.chain.from_iterable(self.chunks)
        self.chunks = pd.DataFrame([{
            'recording': c.recording,
            'onset': c.onset,
            'offset': c.offset,
            'wav': c.getbasename('wav'),
            'mp3': c.getbasename('mp3'),
            'speaker_type': target_speaker_type,
            'date_extracted':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'uploaded': False,
            'project_slug': '',
            'subject_set': '',
            'zooniverse_id': 0
        } for c in self.chunks])

        # shuffle chunks so that they can't be joined back together
        # based on Zooniverse subject IDs
        self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
        self.chunks['batch'] = self.chunks.index.map(
            lambda x: int(x / batch_size))
        self.chunks.index.name = 'index'
        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))