def encode_sentences(task, audios, batch_size=128):
    """Encode audios batch by batch with ``task.predict``.

    The audios are split into groups of ``batch_size`` by ``util.grouper``,
    each group is padded with ``vector_padder``, moved to the GPU, and passed
    to ``task.predict``. The per-batch outputs are copied back to the CPU and
    stacked vertically into a single numpy array.

    Requires a CUDA device, since every batch is moved with ``.cuda()``.
    """
    encoded = []
    for batch in util.grouper(audios, batch_size):
        padded = torch.from_numpy(vector_padder(batch))
        inputs = torch.autograd.Variable(padded).cuda()
        prediction = task.predict(inputs)
        encoded.append(prediction.data.cpu().numpy())
    return numpy.vstack(encoded)
def encode_sentences_SpeechText(task, audios, batch_size=128):
    """Encode audios with the SpeechText encoder stack of ``task``.

    Each padded batch is passed through ``task.SpeechText.SpeechEncoderBottom``
    and then ``task.SpeechText.SpeechEncoderTop``. The per-batch outputs are
    copied back to the CPU and stacked vertically into a single numpy array.

    Requires a CUDA device, since every batch is moved with ``.cuda()``.
    """
    encoded = []
    for batch in util.grouper(audios, batch_size):
        padded = torch.from_numpy(vector_padder(batch))
        inputs = torch.autograd.Variable(padded).cuda()
        # Bottom encoder first, then the top encoder on its output.
        bottom_out = task.SpeechText.SpeechEncoderBottom(inputs)
        top_out = task.SpeechText.SpeechEncoderTop(bottom_out)
        encoded.append(top_out.data.cpu().numpy())
    return numpy.vstack(encoded)
def embed(net, audios, batch_size=32):
    """Return utterance embeddings for audio using the given net.

    Batches are padded with ``vector_padder`` and placed on the same device as
    the first parameter of ``net`` before calling ``net.predict``. The rows of
    every batch output are collected and stacked into one numpy array.
    """
    target_device = next(net.parameters()).device
    embeddings = []
    for batch in util.grouper(audios, batch_size):
        padded = torch.from_numpy(vector_padder(batch)).to(target_device)
        batch_output = net.predict(padded).cpu().numpy()
        # Iterating the array yields one row per utterance in the batch.
        embeddings.extend(batch_output)
    return np.stack(embeddings)
def get_attn_weights(speech_transcriber, sentences):
    """Return the attention weights produced for ``sentences`` as a numpy array.

    Each sentence is turned into audio with ``get_audio``; the audios are
    padded at the end with ``sd.vector_padder`` and passed, together with their
    original lengths, to ``speech_transcriber``. The second element of the
    returned pair is taken as the attention weights.

    Requires a CUDA device, since the inputs are moved with ``.cuda()``.
    """
    sent_audio = [get_audio(s) for s in sentences]
    # The loop variable must not be called `sd`: on Python 2 comprehension
    # variables leak into the function scope and would shadow the `sd` name
    # used for `sd.vector_padder` below.
    sent_len = [audio.shape[0] for audio in sent_audio]
    v_audio = torch.from_numpy(
        sd.vector_padder(sent_audio, pad_end=True)).cuda()
    v_audio_len = torch.from_numpy(numpy.array(sent_len)).cuda()
    _, attn_weights = speech_transcriber(v_audio, v_audio_len)
    # `.numpy()` only works on CPU tensors that do not require grad; the
    # inputs live on the GPU, so detach and copy back to the host first.
    return attn_weights.detach().cpu().numpy()
def transcribe(speech_transcriber, sentences):
    """Return ``speech_transcriber.predict`` output for ``sentences``.

    Each sentence is turned into audio with ``get_audio``; the audios are
    padded at the end with ``sd.vector_padder`` and passed, together with their
    original lengths, to ``speech_transcriber.predict``.

    Requires a CUDA device, since the inputs are moved with ``.cuda()``.
    """
    sent_audio = [get_audio(s) for s in sentences]
    # The loop variable must not be called `sd`: on Python 2 comprehension
    # variables leak into the function scope and would shadow the `sd` name
    # used for `sd.vector_padder` below.
    sent_len = [audio.shape[0] for audio in sent_audio]
    v_audio = torch.from_numpy(
        sd.vector_padder(sent_audio, pad_end=True)).cuda()
    v_audio_len = torch.from_numpy(numpy.array(sent_len)).cuda()
    return speech_transcriber.predict(v_audio, v_audio_len)
def iter_layer_states(model, audios, batch_size=128):
    """Pass audios through the model and for each audio yield the state of
    each timestep and each layer.

    The number of trailing entries kept for each audio is
    ``(len(audio) + model.config['filter_length']) // model.config['stride']``.
    Batches are padded with ``vector_padder`` and passed to
    ``model.task.pile``; each per-audio result ``r`` is yielded as
    ``r[-l:, :, :]``, i.e. only its last ``l`` entries along the first axis.
    """
    # Build the length list explicitly: on Python 3 `map` returns an iterator,
    # which numpy would wrap as a 0-d object array instead of a vector.
    audio_lens = numpy.array([len(audio) for audio in audios])
    lens = (audio_lens + model.config['filter_length']) // model.config['stride']
    rs = (r for batch in util.grouper(audios, batch_size)
          for r in model.task.pile(vector_padder(batch)))
    # Builtin zip is lazy on Python 3 and replaces the removed itertools.izip.
    for r, l in zip(rs, lens):
        yield r[-l:, :, :]
def encode_sentences(model, audios, batch_size=128):
    """Project audios to the joint space using model.

    For each audio returns a vector. Batches are padded with
    ``vector_padder``, moved to the GPU and passed to ``model.task.predict``;
    the per-batch outputs are stacked vertically into one numpy array.

    Requires a CUDA device, since every batch is moved with ``.cuda()``.
    """
    def encode_batch(batch):
        # Pad, wrap and move one batch to the GPU, then predict on it.
        padded = torch.from_numpy(vector_padder(batch))
        inputs = torch.autograd.Variable(padded).cuda()
        return model.task.predict(inputs).data.cpu().numpy()

    per_batch = [encode_batch(batch)
                 for batch in util.grouper(audios, batch_size)]
    return numpy.vstack(per_batch)
def get_state_stack(net, audios, batch_size=128):
    """Pass audios through the model and for each audio return the state of
    each timestep and each layer.

    Inputs are placed on the device of the first parameter of ``net``. The
    number of trailing rows kept per audio comes from applying ``inout`` to
    the audio lengths (presumably an input-to-output length mapping; confirm
    against its definition). Returns a list with one array per audio.
    """
    target_device = next(net.parameters()).device
    kept_lengths = inout(np.array([len(audio) for audio in audios]))

    def per_audio_states():
        # Yield one state array per audio, batch after batch.
        for batch in util.grouper(audios, batch_size):
            padded = torch.from_numpy(vector_padder(batch)).to(target_device)
            for states in state_stack(net, padded).cpu().numpy():
                yield states

    # Keep only the last `n` rows of each audio's states.
    return [states[-n:, :]
            for states, n in zip(per_audio_states(), kept_lengths)]
def get_state_stack(net, audios, batch_size=128):
    """Pass audios through the net and return, for each audio, the state of
    each timestep and each layer.

    Batches are padded with ``vector_padder``, moved to the GPU and passed to
    ``state_stack`` together with ``net``. The number of trailing rows kept
    per audio comes from applying ``inout`` to the audio lengths (presumably
    an input-to-output length mapping; confirm against its definition).
    Returns a list with one array per audio.

    Requires a CUDA device, since every batch is moved with ``.cuda()``.
    """
    # The docstring must be the first statement; placed after these imports
    # it was a dead string expression and __doc__ stayed None.
    import onion.util as util
    from vg.simple_data import vector_padder

    result = []
    lens = inout(np.array(list(map(len, audios))))
    rs = (r for batch in util.grouper(audios, batch_size)
          for r in state_stack(
              net, torch.from_numpy(vector_padder(batch)).cuda()).cpu().numpy())
    for (r, l) in zip(rs, lens):
        # Keep only the last `l` rows of this audio's states.
        result.append(r[-l:, :])
    return result