def __call__(self, mspec, lseg, difflen=0):
    """
    Re-label the segments tagged ``self.inlabel`` using the neural network.

    Segments carrying any other label are copied to the output unchanged.
    Segments carrying ``self.inlabel`` are cut into patches, scored by
    ``self.nn``, smoothed with Viterbi decoding and replaced by
    sub-segments labelled with entries of ``self.outlabels``.

    *** input
    * mspec: mel spectrogram
    * lseg: list of tuples (label, start, stop) corresponding to previous
      segmentations
    * difflen: 0 if the original length of the mel spectrogram is >= 68
      otherwise it is set to 68 - length(mspec)
    *** output
    a list of adjacent tuples (label, start, stop)
    """
    if self.nmel < 24:
        # Keep only the first ``nmel`` columns of the spectrogram.
        mspec = mspec[:, :self.nmel].copy()

    patches, finite = _get_patches(mspec, 68, 2)

    # Drop the trailing patches that correspond to padding.  The trim must
    # be strictly positive before slicing: ``x[:-0]`` is ``x[:0]``, which
    # would discard every patch (this used to happen for difflen == 1).
    trim = int(difflen / 2)
    if trim > 0:
        patches = patches[:-trim, :, :]
        finite = finite[:-trim]
    assert len(finite) == len(patches), (len(patches), len(finite))

    ret = []
    for lab, start, stop in lseg:
        if lab != self.inlabel:
            ret.append((lab, start, stop))
            continue

        rawpred = self.nn.predict(patches[start:stop, :])
        # Patches flagged as not finite get a neutral score of 0.5.
        rawpred[np.logical_not(finite[start:stop]), :] = 0.5
        pred = viterbi_decoding(
            np.log(rawpred),
            diag_trans_exp(self.viterbi_arg, len(self.outlabels)),
        )
        # Sub-segment bounds are relative to the segment: shift by ``start``.
        for lab2, start2, stop2 in _binidx2seglist(pred):
            ret.append((self.outlabels[int(lab2)], start2 + start, stop2 + start))
    return ret
def _gender(nn, patches, finite_patches, speechzicseg):
    """
    Split the segments of ``speechzicseg`` into 'Female' / 'Male' parts.

    * nn: model exposing ``predict``
    * patches: patches indexed by the segment bounds
    * finite_patches: per-patch flags; patches whose flag is False get a
      score of 0.5
    * speechzicseg: iterable of tuples (label, start, stop)

    Segments labelled 'Music' or 'NOACTIVITY' are returned as is.  Returns
    a list of tuples (label, start, stop).
    """
    untouched_labels = ('Music', 'NOACTIVITY')
    gender_names = ('Female', 'Male')

    segments = []
    for label, seg_start, seg_stop in speechzicseg:
        if label in untouched_labels:
            # no energy
            segments.append((label, seg_start, seg_stop))
        else:
            scores = nn.predict(patches[seg_start:seg_stop, :])
            # Element-wise comparison on purpose: selects the flagged rows.
            not_finite = finite_patches[seg_start:seg_stop] == False
            scores[not_finite, :] = 0.5
            decoded = viterbi_decoding(np.log(scores), log_trans_exp(80))
            # Bounds from the decoder are relative to the segment start.
            segments.extend(
                (gender_names[int(idx)], sub_start + seg_start, sub_stop + seg_start)
                for idx, sub_start, sub_stop in _binidx2seglist(decoded)
            )
    return segments
def __call__(self, mspec, lseg, difflen=0):
    """
    Re-label the segments tagged ``self.inlabel`` using the neural network.

    All patches belonging to ``self.inlabel`` segments are concatenated and
    scored in a single ``predict`` call (``batch_size=self.batch_size``).
    The scores are then split back per segment, smoothed with Viterbi
    decoding and turned into sub-segments labelled with entries of
    ``self.outlabels``.  Segments carrying any other label are copied to
    the output unchanged.

    *** input
    * mspec: mel spectrogram
    * lseg: list of tuples (label, start, stop) corresponding to previous
      segmentations
    * difflen: 0 if the original length of the mel spectrogram is >= 68
      otherwise it is set to 68 - length(mspec)
    *** output
    a list of adjacent tuples (label, start, stop)
    """
    if self.nmel < 24:
        # Keep only the first ``nmel`` columns of the spectrogram.
        mspec = mspec[:, :self.nmel].copy()

    patches, finite = _get_patches(mspec, 68, 2)

    # Drop the trailing patches that correspond to padding.  The trim must
    # be strictly positive before slicing: ``x[:-0]`` is ``x[:0]``, which
    # would discard every patch (this used to happen for difflen == 1).
    trim = int(difflen / 2)
    if trim > 0:
        patches = patches[:-trim, :, :]
        finite = finite[:-trim]
    assert len(finite) == len(patches), (len(patches), len(finite))

    # Gather the patches of every segment to classify, in segment order.
    to_predict = [
        patches[start:stop, :]
        for lab, start, stop in lseg
        if lab == self.inlabel
    ]

    # Prediction is only needed when at least one segment carries
    # ``inlabel``; otherwise ``rawpred`` is never read below.
    if to_predict:
        batch = np.concatenate(to_predict)
        # NOTE(review): running tf.initialize_all_variables() in a fresh
        # session looks like it could reset the weights used by self.nn --
        # confirm this is intended.  Left unchanged here.
        with graph.as_default():
            with tf.Session() as sess:
                sess.run(tf.initialize_all_variables())
                rawpred = self.nn.predict(batch, batch_size=self.batch_size)

    ret = []
    cursor = 0  # index of the first unread row of ``rawpred``
    for lab, start, stop in lseg:
        if lab != self.inlabel:
            ret.append((lab, start, stop))
            continue

        seg_len = stop - start
        seg_pred = rawpred[cursor:cursor + seg_len]
        cursor += seg_len
        # Patches flagged as not finite get a neutral score of 0.5.
        seg_pred[np.logical_not(finite[start:stop]), :] = 0.5
        pred = viterbi_decoding(
            np.log(seg_pred),
            diag_trans_exp(self.viterbi_arg, len(self.outlabels)),
        )
        # Sub-segment bounds are relative to the segment: shift by ``start``.
        for lab2, start2, stop2 in _binidx2seglist(pred):
            ret.append((self.outlabels[int(lab2)], start2 + start, stop2 + start))
    return ret
def _speechzic(nn, patches, finite_patches, vad):
    """
    Split the active regions of ``vad`` into 'Speech' / 'Music' parts.

    * nn: model exposing ``predict``
    * patches: patches indexed by the segment bounds
    * finite_patches: per-patch flags; patches whose flag is False get a
      score of 0.5
    * vad: activity sequence, converted to segments by ``_binidx2seglist``

    Segments whose label is 0 are reported as 'NOACTIVITY'.  Returns a list
    of tuples (label, start, stop).
    """
    class_names = ('Speech', 'Music')

    segments = []
    for activity, seg_start, seg_stop in _binidx2seglist(vad):
        if activity == 0:
            # no energy
            segments.append(('NOACTIVITY', seg_start, seg_stop))
        else:
            scores = nn.predict(patches[seg_start:seg_stop, :])
            # Element-wise comparison on purpose: selects the flagged rows.
            not_finite = finite_patches[seg_start:seg_stop] == False
            scores[not_finite, :] = 0.5
            decoded = viterbi_decoding(np.log(scores), log_trans_exp(150))
            # Bounds from the decoder are relative to the segment start.
            segments.extend(
                (class_names[int(idx)], sub_start + seg_start, sub_stop + seg_start)
                for idx, sub_start, sub_stop in _binidx2seglist(decoded)
            )
    return segments
def _energy_activity(loge, ratio=0.03):
    """
    Detect activity from log-energy values.

    A value counts as active when it exceeds the mean of the finite
    entries of ``loge`` plus ``log(ratio)``.  The resulting raw decisions
    are converted with ``pred2logemission`` and smoothed with
    ``viterbi_decoding``, whose result is returned.
    """
    finite_mask = np.isfinite(loge)
    mean_loge = np.mean(loge[finite_mask])
    cutoff = mean_loge + np.log(ratio)

    is_active = loge > cutoff
    emissions = pred2logemission(is_active)
    transitions = log_trans_exp(150, cost0=-5)
    return viterbi_decoding(emissions, transitions)