    def match_to_phase(self, reconst, phase):
        width = reconst.shape[2]

        # Undo the log compression so the STFT round trips below operate
        # on linear magnitudes.
        reconst_exp = conversions.from_log(
            reconst, options=self.conv_options,
            make_pos_fn=lambda val: val.clamp(min=0.0),
            exp_fn=torch.exp)

        # Alternate between resynthesizing with the fixed target phase and
        # re-taking the magnitude, pulling the magnitude toward consistency
        # with that phase. Trim back to the original frame count each pass.
        for _ in range(self.iters):
            samples = self.stft.inverse(reconst_exp, phase)
            reconst_exp, _ = self.stft.transform(samples)
            reconst_exp = reconst_exp[:, :, :width]

        # Return the result to the log domain.
        reconst = conversions.to_log(
            reconst_exp.clamp(min=0.0), options=self.conv_options,
            log_fn=torch.log)

        return reconst
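
This loop holds the supplied phase fixed and re-estimates the magnitude until the two are consistent. The classic Griffin-Lim algorithm works the other way around, re-estimating the phase for a fixed magnitude; a minimal sketch of that variant using plain torch.stft / torch.istft follows (the n_fft, hop, and iteration counts are assumptions, not this repo's settings):

import torch

def griffin_lim(mag, n_fft=1024, hop=256, iters=32):
    # mag: (n_fft // 2 + 1, frames) magnitude spectrogram from an STFT.
    # Start from random phase and alternate between the time domain and
    # the fixed target magnitude until the phase settles.
    phase = 2 * torch.pi * torch.rand_like(mag)
    for _ in range(iters):
        spec = torch.polar(mag, phase)               # magnitude + phase
        samples = torch.istft(spec, n_fft, hop)      # back to time domain
        spec = torch.stft(samples, n_fft, hop, return_complex=True)
        phase = spec.angle()                         # keep only the new phase
    return torch.istft(torch.polar(mag, phase), n_fft, hop)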
Example #2
    def __init__(self,
                 data_path,
                 example_tensor,
                 features,
                 get_phase=False,
                 num_speakers=100,
                 speaker_start_index=0,
                 speaker_take_count=40,
                 utterance_take_count=24):
        self.data_path = data_path
        self.example_tensor = example_tensor

        with open(os.path.join(data_path, 'conv_options.pkl'), 'rb') as f:
            self.conv_options = pickle.load(f)

        # Raw pass-through extractor; the configured extractor is chosen below.
        self.direct_feature_extractor = lambda mag_frames: \
            to_torch(mag_frames, example_tensor)

        if features == 'direct':
            self.feature_extractor = self.direct_feature_extractor
        elif features == 'log':
            self.feature_extractor = lambda mag_frames: \
                to_torch(conversions.to_log(
                    mag_frames, self.conv_options), example_tensor)
        elif features == 'mag_norm':
            band_mags = np.load(os.path.join(data_path, 'band_mags.npy'))
            self.feature_extractor = lambda mag_frames: \
                to_torch(conversions.to_mag_norm(
                    mag_frames, band_mags, self.conv_options), example_tensor)
        elif features == 'two':
            band_mags = np.load(os.path.join(data_path, 'band_mags.npy'))
            self.feature_extractor = lambda mag_frames: \
                to_torch(conversions.to_two(
                    mag_frames, band_mags, self.conv_options), example_tensor)
        else:
            raise RuntimeError("Invalid feature type: " + features)

        self.get_phase = get_phase
        self.num_speakers = num_speakers
        self.speaker_start_index = speaker_start_index
        self.speaker_take_count = speaker_take_count
        self.utterance_take_count = utterance_take_count
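
A hypothetical construction of this dataset; the class name SpeechDataset and the data layout are assumptions, not part of the original snippet:

import torch

example_tensor = torch.zeros(1)     # dtype/device template for to_torch
dataset = SpeechDataset(            # hypothetical name for the class above
    data_path='data/preprocessed',  # must hold conv_options.pkl (plus
                                    # band_mags.npy for 'mag_norm'/'two')
    example_tensor=example_tensor,
    features='log',                 # 'direct', 'log', 'mag_norm', or 'two'
    get_phase=False)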
Example #3
        DATA_PATH, "speech_" + str(speaker) + ".npy")
    speech = np.load(path)

    path = os.path.join(
        DATA_PATH, "sizes_" + str(speaker) + ".npy")
    sizes = np.load(path)

    num_utterances = sizes.shape[0]
    indices = np.concatenate([[0], np.cumsum(sizes)])

    # Encode every utterance for this speaker into a fixed-size embedding.
    encoded = example_tensor.new_zeros(num_utterances, ENCODED_DIM)
    for utterance in range(num_utterances):
        start_index = indices[utterance]
        end_index = indices[utterance + 1]

        value = to_torch(conversions.to_log(
            speech[start_index:end_index], conv_options), example_tensor)

        encoded[utterance] = encoder(value)[0].detach()

    # Iteratively halve the eligible set, keeping the utterances whose
    # embeddings lie closest to the current mean; outliers are discarded
    # early and the lone survivor is a robust center of the set.
    eligible_set = torch.arange(num_utterances)
    while eligible_set.size(0) > 1:
        num_eligible = eligible_set.size(0)
        encoded_subset = encoded[eligible_set]

        encoded_mean = encoded_subset.mean(dim=0, keepdim=True)
        sq_distances = ((encoded_subset - encoded_mean) ** 2).sum(dim=1)
        _, best_indices = torch.topk(
            sq_distances, num_eligible // 2, largest=False)
        eligible_set = eligible_set[best_indices]

    center = encoded[eligible_set[0]].unsqueeze(0)
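
Because integer division halves the eligible set on every pass, the loop runs ⌊log₂ N⌋ times: with N = 24 utterances, for example, the set shrinks 24 → 12 → 6 → 3 → 1 in four passes. Each pass discards the half of the embeddings farthest from the current mean, so outliers are dropped early and cannot pull the final center toward themselves.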
Example #4
if torch.cuda.is_available():
    encoder = encoder.cuda()

files = [f for f in os.listdir(INPUT_PATH) if f.endswith('.wav')]
encoded = example_tensor.new_zeros(len(files), ENCODED_DIM)

print("Found %d files" % len(files))

for (i, file) in enumerate(files):
    path = os.path.join(INPUT_PATH, file)
    speech, _ = conversions.encode(SAMPLE_RATE,
                                   conversions.load_wav(path, SAMPLE_RATE),
                                   conv_options)

    value = to_torch(conversions.to_log(speech, conv_options), example_tensor)
    encoded[i] = encoder(value).detach()

# Same halving scheme as above: keep the half of the embeddings closest
# to the mean until a single file remains.
eligible_set = torch.arange(len(files))
while eligible_set.size(0) > 1:
    num_eligible = eligible_set.size(0)
    encoded_subset = encoded[eligible_set]

    encoded_mean = encoded_subset.mean(dim=0, keepdim=True)
    sq_distances = ((encoded_subset - encoded_mean) ** 2).sum(dim=1)
    _, best_indices = torch.topk(sq_distances,
                                 num_eligible // 2,
                                 largest=False)
    eligible_set = eligible_set[best_indices]

result = encoded[eligible_set[0]].unsqueeze(0).cpu()
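
This snippet and the previous one repeat the same halving loop, so it could be factored into a shared helper. A minimal sketch under that assumption (the name robust_center is illustrative):

import torch

def robust_center(encoded):
    # Repeatedly keep the half of the embeddings closest to the current
    # mean; the last survivor is a robust center of the set.
    eligible = torch.arange(encoded.size(0))
    while eligible.numel() > 1:
        subset = encoded[eligible]
        mean = subset.mean(dim=0, keepdim=True)
        sq_dist = ((subset - mean) ** 2).sum(dim=1)
        _, best = torch.topk(sq_dist, eligible.numel() // 2, largest=False)
        eligible = eligible[best]
    return encoded[eligible[0]].unsqueeze(0)

With this helper, the loop above reduces to result = robust_center(encoded).cpu().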