import numpy as np

import audio_utils  # project-local module; get_text_from_file,
                    # extract_times_from_row, and distribute_time are also
                    # assumed to be in scope elsewhere in this file


def no_laughter_present(t_files, start, end):
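    """Return True if no transcript region labeled 'laughter' in any of the
    t_files overlaps [start, end]; each row is expected to end with its
    label token."""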
    for t_file in t_files:
        all_rows = get_text_from_file(t_file)
        for row in all_rows:
            region_start, region_end = extract_times_from_row(row)
            if audio_utils.times_overlap(float(region_start), float(region_end), float(start), float(end)):
                if 'laughter' in row.split()[-1]:
                    return False
    return True
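
# get_text_from_file and extract_times_from_row are defined elsewhere in this
# project; the code above only assumes that the former yields a file's rows
# and that the latter returns a row's float-convertible (start, end) fields.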
def combine_overlapping_regions(regions_A, regions_B):
    """Merge two lists of (start, end) regions into one sorted list of
    non-overlapping regions."""
    all_regions = regions_A + regions_B
    overlap_found = True
    while overlap_found:
        overlap_found = False
        i = 0
        while i < len(all_regions) and not overlap_found:
            j = i + 1  # compare each pair once, with j > i
            while j < len(all_regions):
                start1, end1 = all_regions[i]
                start2, end2 = all_regions[j]
                if audio_utils.times_overlap(start1, end1, start2, end2):
                    overlap_found = True
                    # Replace the pair with one covering region, then restart
                    # the scan: indices are stale once the list is mutated.
                    all_regions.pop(j)  # pop the later index first
                    all_regions.pop(i)
                    all_regions.append((min(start1, start2), max(end1, end2)))
                    break
                j += 1
            i += 1
    return sorted(all_regions, key=lambda r: r[0])
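
# A usage sketch with hypothetical data: chained overlaps collapse into a
# single region, since merging repeats until a sweep finds no overlapping pair.
#   combine_overlapping_regions([(0.0, 5.0), (8.0, 12.0)], [(4.0, 9.0)])
#   -> [(0.0, 12.0)]
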
def get_10_second_clips(regions_list,
                        audio_file_path,
                        full_audio_file_length,
                        index,
                        audioset_laughter_fraction,
                        adjustment_amount=0):
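    """Assemble laughter regions ((start_sec, duration_sec) pairs, assumed to
    be sorted by start time) into roughly 10-second clips, pad them, then trim
    the padding so this conversation's laughter fraction approaches
    audioset_laughter_fraction. Returns (rows, all_clips, total_laughter_time,
    total_window_time); the index argument is unused here."""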
    if len(regions_list) == 0:
        return [], [], 0, 0
    # First pass to find clips
    all_clips = []
    current_start = None
    current_end = None
    for region_start, region_length in regions_list:
        if current_start is None:
            current_start = region_start
            beginning_space = current_start
        # Close out the running clip once this region would push it past 10 s.
        if current_end is not None and region_start + region_length > current_start + 10:
            all_clips.append({
                'window': [current_start, current_end],
                'beginning_buffer': 0.,
                'end_buffer': 0.,
                'beginning_space': beginning_space
            })
            current_start = region_start
            beginning_space = current_start - current_end  # new start point to old end point
        current_end = region_start + region_length
    if current_start is not None and current_end is not None:
        end_space = full_audio_file_length - current_end
        all_clips.append({
            'window': [current_start, current_end],
            'beginning_buffer': 0.,
            'end_buffer': 0.,
            'beginning_space': beginning_space,
            'end_space': end_space
        })
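
    # Bookkeeping: each clip's 'window' spans one run of laughter regions;
    # 'beginning_space' is the free gap back to the previous clip (or to the
    # start of the file), 'end_space' the gap forward to the next clip or the
    # file end (filled in just below for all but the last clip). The buffer
    # fields record padding added by later passes so trimming can reclaim it.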

    for i, clip in enumerate(all_clips):
        if 'end_space' not in clip:
            clip['end_space'] = all_clips[i + 1]['beginning_space']

    # 2nd pass: extend each clip by 0.5 s per side when both sides have room
    for i, clip in enumerate(all_clips):
        start, end = clip['window']
        time_to_add_per_side = 0.5
        if (time_to_add_per_side < clip['beginning_space']
                and time_to_add_per_side < clip['end_space']):
            clip['window'] = [start - time_to_add_per_side,
                              end + time_to_add_per_side]
            clip['beginning_space'] -= time_to_add_per_side
            clip['end_space'] -= time_to_add_per_side
            clip['beginning_buffer'] += time_to_add_per_side
            clip['end_buffer'] += time_to_add_per_side
            # Padding this clip consumes the neighbors' free space too.
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add_per_side
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add_per_side

    # 3rd pass: Go back through, centering and extending windows out to 10s
    for i, clip in enumerate(all_clips):
        start, end = clip['window']
        length = end - start
        # Pad equally on both sides; if that doesn't fit, try putting all of
        # the needed time on one side; otherwise give up on this clip.
        time_to_add = max(10 - length, 0)  # never shorten clips already over 10 s
        time_to_add_per_side = time_to_add / 2
        if (time_to_add_per_side < clip['beginning_space']
                and time_to_add_per_side < clip['end_space']):
            clip['window'] = [start - time_to_add_per_side,
                              end + time_to_add_per_side]
            clip['beginning_space'] -= time_to_add_per_side
            clip['end_space'] -= time_to_add_per_side
            clip['beginning_buffer'] += time_to_add_per_side
            clip['end_buffer'] += time_to_add_per_side
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add_per_side
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add_per_side
        elif time_to_add < clip['beginning_space']:
            clip['window'] = [start - time_to_add, end]
            clip['beginning_buffer'] += time_to_add
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add
        elif time_to_add < clip['end_space']:
            clip['window'] = [start, end + time_to_add]
            clip['end_buffer'] += time_to_add
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add
        # else: not enough room anywhere; the clip stays under 10 s.
        # Snap tiny negative buffers (floating-point drift) back to zero.
        if -0.1 < clip['beginning_buffer'] < 0:
            clip['beginning_buffer'] = 0.
        if -0.1 < clip['end_buffer'] < 0:
            clip['end_buffer'] = 0.

    # 4th pass: Compute the class-balance (laughter fraction) for this conversation
    total_window_time = sum(
        [clip['window'][1] - clip['window'][0] for clip in all_clips])
    total_laughter_time = sum([region[1] for region in regions_list])
    swb_laughter_fraction = total_laughter_time / total_window_time

    # Tune adjustment_amount until, after trimming, the class balances match.
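    # Since laughter_fraction = laughter_time / window_time, the window time
    # that hits the AudioSet fraction is
    # laughter_time / audioset_laughter_fraction: e.g. 30 s of laughter at a
    # target fraction of 0.25 calls for 120 s of total window time.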
    intended_window_time = (total_laughter_time / audioset_laughter_fraction
                            + adjustment_amount)

    # 5th pass: Trim back the clips to match the class-balance distribution of the Audioset Annotations
    # Need to reduce the windows to cut 'total_window_time' down to 'intended_window_time'
    # Try to distribute the time so that all windows are close to the same size
    time_to_reduce = total_window_time - intended_window_time

    beginning_buffers = [clip['beginning_buffer'] for clip in all_clips]
    end_buffers = [clip['end_buffer'] for clip in all_clips]
    all_buffers = beginning_buffers + end_buffers
    time_to_reduce_per_buffer = distribute_time(time_to_reduce, all_buffers)
    beginning_buffer_updates, end_buffer_updates = np.split(
        time_to_reduce_per_buffer, 2)

    # Buffers should be non-negative here; floating-point drift can push them
    # slightly below zero, which the clamping below corrects.
    # distribute_time returns one reduction per buffer; np.split hands back
    # the first half for clip beginnings and the second half for clip ends.
    assert len(beginning_buffer_updates) == len(all_clips)
    assert len(end_buffer_updates) == len(all_clips)

    for i, clip in enumerate(all_clips):
        clip['window'][0] += beginning_buffer_updates[i]
        clip['beginning_space'] += beginning_buffer_updates[i]
        clip['beginning_buffer'] -= beginning_buffer_updates[i]
        clip['window'][1] -= end_buffer_updates[i]
        clip['end_space'] += end_buffer_updates[i]
        clip['end_buffer'] -= end_buffer_updates[i]
        # Snap tiny negative buffers (floating-point drift) back to zero.
        if -0.1 < clip['beginning_buffer'] < 0:
            clip['beginning_buffer'] = 0.
        if -0.1 < clip['end_buffer'] < 0:
            clip['end_buffer'] = 0.

    # 6th pass: Re-Compute the class-balance (laughter fraction) for this conversation
    total_window_time = sum(
        [clip['window'][1] - clip['window'][0] for clip in all_clips])
    total_laughter_time = sum([region[1] for region in regions_list])
    swb_laughter_fraction = total_laughter_time / total_window_time
    intended_window_time = total_laughter_time / audioset_laughter_fraction
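    # (swb_laughter_fraction and intended_window_time are recomputed here only
    # for inspection; the recomputed totals feed the return value below.)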

    # Now make the dataframe
    rows = []
    # For each window, record every laughter region that overlaps it, keeping
    # absolute start/end times alongside the window start.
    for i, clip in enumerate(all_clips):
        inside_regions = [
            r for r in regions_list if audio_utils.times_overlap(
                clip['window'][0], clip['window'][1], r[0], r[0] + r[1])
        ]

        # Only the first 5 overlapping regions are recorded below, under the
        # column names 'Start'/'End', 'Start.1'/'End.1', ...; extras dropped.

        h = {
            'FileID': audio_file_path.split('/')[-1].split('.')[0],
            'audio_path': audio_file_path,
            'audio_length': full_audio_file_length,
            'window_start': clip['window'][0],
            'window_length': clip['window'][1] - clip['window'][0]
        }

        for j in range(5):
            if j == 0:
                start_key = 'Start'
                end_key = 'End'
            else:
                start_key = f'Start.{j}'
                end_key = f'End.{j}'
            if len(inside_regions) > j:
                r = inside_regions[j]
                h[start_key] = r[0]
                h[end_key] = r[0] + r[1]
            else:
                h[start_key] = np.nan
                h[end_key] = np.nan

        if h['window_length'] > 1.:  # skip degenerate windows of 1 s or less
            rows.append(h)

    return rows, all_clips, total_laughter_time, total_window_time
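
# distribute_time is defined elsewhere in this project. Below is a minimal
# sketch of the behavior the trimming pass above assumes: spread `total`
# across the buffers as evenly as possible without taking more from any
# buffer than it holds (a water-filling split). This is an assumption about
# the helper, not the project's actual implementation.
def distribute_time(total, buffers):
    reductions = [0.0] * len(buffers)
    remaining = max(float(total), 0.0)  # nothing to trim if total <= 0
    open_idx = [i for i, b in enumerate(buffers) if b > 0]
    while remaining > 1e-9 and open_idx:
        # Offer every still-open buffer an equal share of what remains.
        share = remaining / len(open_idx)
        still_open = []
        for i in open_idx:
            take = min(share, buffers[i] - reductions[i])
            reductions[i] += take
            remaining -= take
            if buffers[i] - reductions[i] > 1e-9:
                still_open.append(i)  # buffer not exhausted; keep it open
        open_idx = still_open
    return np.array(reductions)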