Example #1
def _detect_in_audio(wavdata,
                     sr,
                     model=None,
                     precision=3,
                     algorithms=['threshold'],
                     params=dict(n=5, t=.8),
                     verbose=True,
                     norm=False,
                     name='.noname'):
    '''
    This internal method ties together the two methods before it,
    `score_continuous_data` and `decode_sequence`. The method takes in raw
    wave audio data and does the heavy lifting of scoring the audio,
    assigning labels, and so on.
    ---
        wavdata: raw wave audio data; typically an entire episode
        sr: audio sampling rate (frames per second; Hz)
        model: an instance of the Keras BaseModel class that supports
               model.predict in order to assign multiclass/binary
               probabilities to data
        precision: the higher, the more precise (time-wise) the labels, and
                   the longer the initial computation takes
        algorithms: a list of algorithms to use for scoring. results from
                    all of the specified algorithms are returned
        params: dict; params specific to the algorithms requested. e.g., for
                threshold, n (window len) and t (threshold) may be supplied

        return: (decoded, preds), a tuple where decoded['algorithm'] stores
                the labels assigned by 'algorithm', and preds stores the raw
                output of model.predict(), should the parent method need it
                for anything
    '''
    preds = score_continuous_data(wavdata=wavdata,
                                  sr=sr,
                                  model=model,
                                  precision=precision,
                                  norm=norm,
                                  name=name)

    decoded = defaultdict(list)
    for alg in algorithms:
        color.INFO('INFO', 'decoding labels with {}'.format(alg))
        try:
            decoded[alg] = decode_sequence(probs=preds,
                                           algorithm=alg,
                                           params=params,
                                           verbose=verbose)
        except NotImplementedError:
            color.INFO('FUTURE', 'WIP; {} not yet implemented'.format(alg))

    # one timestamp (in ms) per prediction chunk; each chunk spans
    # 0.96/precision seconds of audio
    decoded['timestamp'] = [
        int(i * (.96e3 / precision)) for i, _ in enumerate(preds)
    ]

    return decoded, preds
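
# usage sketch (not from the original source): the wav path and the
# pretrained `model` are hypothetical and assumed to exist; any Keras model
# exposing .predict would do
sr, wavdata = wavfile.read('../wav/friends-s02-e03.wav')
decoded, preds = _detect_in_audio(wavdata, sr,
                                  model=model,
                                  precision=3,
                                  algorithms=['threshold'],
                                  params=dict(n=5, t=.8))
# decoded['threshold'] holds per-chunk labels; decoded['timestamp'] holds
# the matching millisecond offsets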
Example #2
def export_path():
    '''
    adds directory with vggish-related scripts to system path
    in the current session so that they can be imported with ease
    '''
    cwd = Path.cwd()
    repo = cwd.parent
    for child in ['vggish', 'utils']:
        childpath = repo.joinpath(child)
        if childpath.exists():
            # sys.path entries should be strings, not Path objects
            sys.path.append(str(childpath))
            color.INFO('INFO', 'added "{}" to system path'.format(childpath))
        else:
            color.INFO('INFO', 'skipped "{}": did not exist'.format(childpath))
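
# usage sketch (assumption: the calling script lives one level below the
# repo root, as Path.cwd().parent implies); call once before importing any
# of the vggish-related scripts
export_path()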
Example #3
def _overlay_video(cap, metadata, preds, writer, precision=2, skip=0):
    '''
    the most basic video overlay method. needs a capture object, an array of
    predictions, and a precision level (predictions per 0.96s audio chunk)
    in order to add an overlay to a video. method is internal because
    wrappers to initialize a capture object and make predictions are
    available.
    '''
    fps = metadata['fps']
    audioframe_dur = .96 / precision

    color.INFO('INFO', 'applying overlay to individual frames')
    for i in progressbar(range(metadata['frames']), redirect_stdout=True):

        # map this video frame's timestamp to the matching prediction index
        time = i / fps
        predindex = int(time / audioframe_dur)

        flag, frame = cap.read()
        if not flag:  # the capture ran out of frames early
            break

        # only recompute the overlay every (skip + 1) frames; skipped frames
        # re-write the last overlaid frame for speedup
        if i % (skip + 1) == 0:
            try:
                new_frame = overlay_frame(frame,
                                          height=metadata['height'],
                                          width=metadata['width'],
                                          preds=preds,
                                          index=predindex)
            except IndexError:
                break

        writer.write(new_frame)
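
# sketch of how a wrapper might wire this up with OpenCV; the paths here
# are hypothetical, the metadata keys mirror what _overlay_video reads, and
# `preds` is the model-output array described in the docstring
import cv2

cap = cv2.VideoCapture('../video/friends-s02-e03.mp4')
metadata = dict(fps=cap.get(cv2.CAP_PROP_FPS),
                frames=int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
                width=int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                height=int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter('../video/friends-s02-e03_preds-overlay.mp4',
                         fourcc, metadata['fps'],
                         (metadata['width'], metadata['height']))
_overlay_video(cap, metadata, preds, writer, precision=2, skip=0)
cap.release()
writer.release()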
Example #4
def build_laugh_model(*args, **params):
    '''
    deprecated alias for `build_dense_model`, kept for backward
    compatibility.
    '''
    color.INFO(
        'WARNING', 'this function is left here for compatibility. '
        'please use `build_dense_model` in your methods. '
        'this function will be deprecated in the future.')
    return build_dense_model(*args, **params)
Example #5
def _binary_probs_to_multiclass(binary_probs=None):
    '''
    Helper method to convert an array of binary probabilities to multiclass
    probabilities. This is necessary because a multiclass probabilities
    array specifies a probability for each class, whereas a binary array
    specifies only the probability of the positive class.
    '''
    assert binary_probs.shape[-1] == 1, 'badly shaped binary probabilities'
    color.INFO('INFO', 'converting binary probs array to multiclass')
    multi = [np.array([1 - x, x]) for x in binary_probs]
    return np.array(multi).reshape(-1, 2)
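
# quick shape check of the helper above (synthetic probabilities):
binary = np.array([[.9], [.2], [.6]])            # shape (3, 1)
multi = _binary_probs_to_multiclass(binary)      # shape (3, 2)
# multi == [[.1, .9], [.8, .2], [.4, .6]]; each row sums to 1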
Example #6
def _get_data_spectro(which_episodes,
                      preserve_length=False,
                      archive='../data/archive',
                      task='laughter',
                      windowlen=100):
    '''
    gets spectrogram data for one or more episodes by calling
    soundutils.get_data_spectro on chunks, repeatedly
    '''
    if isinstance(which_episodes, str):
        which_episodes = [which_episodes]

    X, Y, refs = [], [], []
    for ep in which_episodes:
        laughs, nolaughs = get_patches(ep, task)
        sr, wavdata = wavfile.read('../wav/{}.wav'.format(ep))

        for label in (0, 1):

            color.INFO('INFO',
                       'processing data for ep={}; label={}'.format(ep, label))

            for start, end in progressbar([nolaughs, laughs][label],
                                          redirect_stdout=True):
                if start + windowlen >= end: continue
                start_f, end_f = _convertref(start, sr), _convertref(end, sr)

                f, t, samples = soundutils.get_data_spectro(
                    wavdata[start_f:end_f], sr, windowlen=windowlen)
                if preserve_length:
                    X.append(samples)
                    Y.append(label)
                    refs.append((ep, start, end))
                else:
                    X += [s for s in samples]
                    Y += [label for _ in samples]
                    refs += [(ep, start, end) for _ in samples]

    # note: vstack assumes equally shaped samples; with preserve_length=True,
    # variable-length chunks may fail to stack
    X = np.vstack(X)
    Y = np.vstack(Y)
    return X, Y, refs
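
# usage sketch (not from the original source): the episode basename is
# hypothetical; assumes annotation patches and ../wav/<ep>.wav exist
X, Y, refs = _get_data_spectro('friends-s02-e03',
                               task='laughter',
                               windowlen=100)
# X: stacked spectrogram windows; Y: 0/1 labels; refs: (ep, start, end)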
Example #7
def decode_sequence(probs=None,
                    algorithm='threshold',
                    params=dict(n=5, t=.8),
                    verbose=True):
    '''
    Once a model outputs probabilities for some sequence of data, that
    data shall be passed to this method. This method will use various
    ways to decode an underlying sequence in order to determine where
    the *actual* canned laughter was.
    possible algorithms to decode sequence:
        - 'neural'
          surround-n-gram neural network: this method will use a pretrained
          Keras model to label some sample i using the multiclass probabilities
          of all of the samples numbered [i-n, i-n+1, ... i, i+1, ..., i+n],
          i.e., n before and n afterwards.
        - 'hmm'
          HMM: this method will use a hidden Markov model with underlying
               states that are the same as surface states (the two state spaces
               for hidden and observed are equivalent).
               uses Viterbi to decode the underlying state sequence.
               requires params to be passed as dict(c=DiscreteDistribution)
               where c is a class (label) and DiscreteDistribution is an
               instance of emission probabilities created using `pomegranate`,
               for each such class c (0, 1, 2, ...)
        - 'threshold'
          window and threshold method: this is a simple heuristic-based
          method that observes windows of length n; if the average
          probability of any single class is at least t, it assigns that
          class to all of the samples in that window. imagine a threshold of
          0.9: if only a few samples in a window are labeled with some other
          class, it is intuitively likely they were mislabeled.
        - 'modethreshold'
          like 'threshold' but instead of considering avg probability, it
          considers what percentage of labels are a particular class and if
          that surpasses a threshold, then all labels are made that same label
    ---
        probs: an nparray of (n_samples, n_classes) probabilities such that
               for each sample, the probabilities across classes sum to 1.
               in case the supplied array is of shape (n_samples, 1), it
               will be converted to multiclass using this module's
               _binary_probs_to_multiclass method

        return: a list of len n_samples, with the ith sample being the
                predicted label of that sample. this prediction would usually
                also incorporate somehow the samples before and after the
                current sample
    '''
    color.INFO('INFO', 'shape of input probs is: {}'.format(probs.shape))
    if probs.shape[-1] == 1:
        probs = _binary_probs_to_multiclass(probs)
        color.INFO('INFO',
                   'converted probs to shape {}'.format(str(probs.shape)))

    if algorithm == 'threshold':
        n, t = params['n'], params['t']
        labels = [np.argmax(timechunk) for timechunk in probs]

        for i in range(len(probs) - n + 1):
            for c in range(probs.shape[-1]):
                avg = np.average(probs[i:i + n], axis=0)[c]
                if avg >= t:
                    # the window clears the threshold for class c; relabel
                    # every sample in it
                    labels[i:i + n] = [c] * n

        return labels

    elif algorithm in ('hmm', 'viterbi'):
        # define default emission probabilities
        default = {
            0: pmgt.DiscreteDistribution({
                '0': 0.7,
                '1': 0.3
            }),
            1: pmgt.DiscreteDistribution({
                '0': 0.2,
                '1': 0.8
            })
        }

        states = []
        for c in range(probs.shape[-1]):
            state = pmgt.State(params.get(c, default[c]), name=str(c))
            states.append(state)

        model = pmgt.HiddenMarkovModel('laugh-decoder')
        model.add_states(states)

        if 'transitions' in params:
            model.add_transitions(params['transitions'])
        else:
            # start must always go to state 0
            model.add_transitions([model.start, states[0]],
                                  [states[0], model.end], [1., .1])
            model.add_transitions([states[0], states[0], states[1], states[1]],
                                  [states[0], states[1], states[0], states[1]],
                                  [.5, .4, .2, .8])
        model.bake()

        # if verbose:
        #     model.plot() # plotting is weird

        # observations are the argmax labels; Viterbi decodes the hidden path
        labels = [str(np.argmax(entry)) for entry in probs]
        labels = model.predict(sequence=labels, algorithm='viterbi')
        # strip pomegranate's start and end states from the decoded path
        return labels[1:-1]

    else:
        raise NotImplementedError
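
# synthetic run of the 'threshold' decoder (probabilities made up for
# illustration): with n=3 and t=.8, any length-3 window whose average
# probability for a class clears .8 gets that class for all its samples
probs = np.array([[.9, .1], [.85, .15], [.2, .8],
                  [.1, .9], [.15, .85], [.05, .95]])
labels = decode_sequence(probs=probs, algorithm='threshold',
                         params=dict(n=3, t=.8))
# labels -> [0, 0, 1, 1, 1, 1]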
Example #8
def score_continuous_data(wavdata=None,
                          sr=None,
                          model=None,
                          precision=3,
                          L=1,
                          archive='../data/archive',
                          name='.noname',
                          norm=False):
    '''
    Given wavdata of an audio signal and its sampling rate, this method
    will generate more embeddings for the same data than are typically needed
    for training, in order to get a better estimate of where in the audio
    there's canned laughter.
    ---
        wavdata: raw wavdata of an audio signal; typically, an entire episode
        sr: audio sampling rate (frames per second; Hz)
        model: the model used to assign probability scores to an embedding.
               must be an instance of the Keras Model or BaseModel class
               and support prediction of data.
        precision: number of embeddings to generate, each with an equally
                   spaced-out offset less than 0.96s so that each embedding is
                   generated over a unique time interval. this number will also
                   determine the precision of the labeling, to the nearest
                   (.96/precision) seconds. note that generating an embedding
                   for each precision-point takes time and memory, so high
                   precision and memory-and-time constraints need to be
                   balanced for an optimal level of precision (default: 3;
                   min: 1).
        L: the length of the sequence the model accepts to make predictions
           about labels. (default: 1) [WIP; not implemented]. any value
           other than 1 will result in an Exception.

        return: outputs a (len(wavdata)*precision/(sr*.96-L), n_classes) shaped
                array of labels predicted by the model supplied

    '''
    offsets = np.arange(0, 0.96, 0.96 / precision)

    archivepath = Path(archive)
    archivepath = archivepath.joinpath(name + '_emb_prec=%d.npz' % precision)

    if archivepath.exists():
        color.INFO('INFO', 'loading archived data from {}'.format(archivepath))
        data = np.load(archivepath)
        embs = data['embs']
    else:
        color.INFO('INFO', 'no archived data found at {}'.format(archivepath))
        embs = []
        for x in offsets:
            start_f = int(sr * x)
            color.INFO(
                'INFO', 'computing embedding for offset {}; '
                'this may take a while'.format(x))

            emb, utils.sess = get_embed(input_wav=wavdata[start_f:],
                                        sr=sr,
                                        sess=utils.sess)
            if norm:
                emb = normalize(np.stack(emb, axis=0))
            embs.append(emb)
        np.savez_compressed(archivepath, embs=np.array(embs))

    color.INFO('INFO', 'unpacking offset embeddings into single list')
    # interleave the per-offset embedding runs so that samples end up in
    # chronological order: (off0[0], off1[0], ..., off0[1], off1[1], ...)
    sequence = [*sum(zip(*embs), ())]

    color.INFO('INFO', 'making predictions')
    preds = []
    for item in sequence:
        pred = model.predict(x=item.reshape(1, -1))
        preds.append(pred)

    return np.vstack(preds)
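
# toy illustration of the zip/sum interleaving step above, with
# precision=3 offset runs of two embeddings each:
embs = [['a0', 'a1'], ['b0', 'b1'], ['c0', 'c1']]
sequence = [*sum(zip(*embs), ())]
# sequence -> ['a0', 'b0', 'c0', 'a1', 'b1', 'c1'] (chronological order)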
Example #9
def _get_data_vggish(which_episodes=None,
                     preserve_length=False,
                     archive='../data/archive',
                     task='laughter'):
    '''
    gets embeddings data for a list of episodes.
    expects basenames of the episodes without any attribute or filename
    extension.
    returns class data generated using patches loaded for the same episodes
    ---
        which_episodes: a list of basenames (e.g. "friends-s02-e03") of
                        episodes to process (required),
                        or the name of a single episode
        preserve_length: whether to return data as disjoint chunks of equal
                         length, or preserve length of annotated chunks and
                         return variable-length data (default: False)
        archive: directory housing memoized archives of the individual
                 episodes' data for a task so that they don't have to be
                 recomputed each time. default: '../data/archive'. if an
                 empty string, None, or anything else falsy is passed, the
                 data for this run will not be archived.

        return: X, Y of shape (n_samples, n_features) when not
                preserve_length, or (n_samples, maxlen, n_features) when
                preserving; the number of samples differs between the two
                cases. also refs, a list of where each sample came from in
                the original episode
    '''
    if isinstance(which_episodes, str):
        which_episodes = [which_episodes]

    color.INFO('INFO', 'processing episodes {}'.format(str(which_episodes)))

    X, Y, refs = [], [], []

    for ep in which_episodes:
        color.INFO('INFO', 'processing {}'.format(ep))

        laughs, nolaughs = get_patches(ep, task)

        existsflag = False
        archivepath = Path(archive)
        archivepath = archivepath.joinpath(ep + '_%s_datachunks.npz' % task)

        if archive:
            # check if archives already exist
            if archivepath.exists():
                color.INFO('INFO', 'loading from {}'.format(archivepath))
                existsflag = True
                arrays = np.load(archivepath)
                this_X = arrays['X'].tolist()
                this_Y = arrays['Y'].tolist()
                this_refs = arrays['refs'].tolist()
            else:
                this_X, this_Y, this_refs = [], [], []
                sr, wavdata = wavfile.read('../wav/{}.wav'.format(ep))
        else:
            sr, wavdata = wavfile.read('../wav/{}.wav'.format(ep))

        color.INFO('INFO', 'processing %s data in %s' % (task, ep))
        if not existsflag:
            for start, end in progressbar(laughs, redirect_stdout=False):
                if start == end: continue
                start_f, end_f = _convertref(start, sr), _convertref(end, sr)
                try:
                    this_x, utils.sess = get_embed(
                        input_wav=wavdata[start_f:end_f],
                        sr=sr,
                        sess=utils.sess)
                    if preserve_length:
                        this_X += [this_x]
                        this_Y += [1]
                        this_refs += [(ep, start, end)]
                    else:
                        this_X += [x.reshape(1, -1) for x in this_x]
                        this_Y += [1 for _ in this_x]
                        this_refs += [(ep, start, end) for _ in this_x]
                except Exception as e:
                    color.ERR('INFO',
                              'encountered {}; resuming...\r'.format(e))

        color.INFO('INFO', 'processing no-%s data in %s' % (task, ep))
        if not existsflag:
            for start, end in progressbar(nolaughs, redirect_stdout=True):
                if start == end: continue
                start_f, end_f = _convertref(start, sr), _convertref(end, sr)
                try:
                    this_x, utils.sess = get_embed(
                        input_wav=wavdata[start_f:end_f],
                        sr=sr,
                        sess=utils.sess)
                    if preserve_length:
                        this_X += [this_x]
                        this_Y += [0]
                        this_refs += [(ep, start, end)]
                    else:
                        this_X += [x.reshape(1, -1) for x in this_x]
                        this_Y += [0 for _ in this_x]
                        this_refs += [(ep, start, end) for _ in this_x]
                except Exception as e:
                    color.ERR('INFO',
                              'encountered {}; resuming...\r'.format(e))

        X += this_X
        Y += this_Y
        refs += this_refs

        this_X = np.vstack(this_X)
        this_Y = np.array(this_Y, dtype=int)
        this_refs = np.array(this_refs, dtype=object)

        if archive and not existsflag:
            np.savez_compressed(archivepath,
                                X=this_X,
                                Y=this_Y,
                                refs=this_refs)

        del this_X
        del this_Y
        del this_refs

    if preserve_length:
        return X, Y, refs
    else:
        return np.vstack(X), np.array(Y, dtype=int), np.array(refs,
                                                              dtype=object)
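
# usage sketch (not from the original source): episode basenames are
# hypothetical; assumes annotations and ../wav/<ep>.wav files exist
X, Y, refs = _get_data_vggish(['friends-s02-e03', 'friends-s02-e04'],
                              task='laughter')
# X: (n_samples, n_features) vggish embeddings; Y: 0/1 labels;
# refs: (episode, start, end) provenance for each sample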
Example #10
                            '--skip',
                            type=int,
                            help='skip these many frames for speedup',
                            default=0)

    config = arg_parser.parse_args()

    inp = Path('../video').joinpath(config.episode + '_preds-overlay' + '.mp4')
    aud = Path('../wav').joinpath(config.episode + '.wav')
    out = Path('../video').joinpath(config.episode +
                                    '_preds-overlay_with-audio' + '.mp4')

    try:
        if config.overlay > 0:
            model = modelbuilder.build_laugh_model()
            model.load_weights(
                filepath='../laughter/task:per-season-split-ckpt.hdf5')
            model = modelbuilder._compile_binary(model)
            overlay_episode(config.episode, model, skip=config.skip)
    except KeyboardInterrupt:
        pass

    if config.audio > 0:  # -vcodec copy -acodec copy
        cmd = 'ffmpeg -i {} -i {} -c:v libx264 -c:a libvorbis -shortest {}'
        # cmd = 'ffmpeg -i {} -i {} -c:v mpeg4 -c:a libvorbis -shortest {}'
        cmd = cmd.format(str(inp), str(aud), str(out))
        try:
            subprocess.run(cmd.split(' '))
        except TypeError:
            color.INFO('DEBUG', 'catching a TypeError for some reason?')