Example #1
word_num = section["word_num"]
letter_num = section["letter_num"]
print("Done!")

#%%
print("Loading results....")
word_results = np.load(results_dir / "word_stateseq.npz")
letter_results = np.load(results_dir / "letter_stateseq.npz")
duration_results = np.load(results_dir / "word_durations.npz")
keys = sorted(word_results.keys())
train_iter = word_results[keys[0]].shape[0]

if args.speaker_id is not None:
    speaker, spkind_keys = separate_speaker(np.load(args.speaker_id))
    speaker_N = len(speaker)
    spkind_phn_labels = get_separated_values(phn_labels, spkind_keys)
    spkind_wrd_labels = get_separated_values(wrd_labels, spkind_keys)
    spkind_letter_results = get_separated_values(letter_results, spkind_keys)
    spkind_word_results = get_separated_values(word_results, spkind_keys)

    spkind_letter_ARI = np.zeros((speaker_N, train_iter))
    spkind_letter_macro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_letter_micro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_word_ARI = np.zeros((speaker_N, train_iter))
    spkind_word_macro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_word_micro_f1_score = np.zeros((speaker_N, train_iter))

    spkind_letter_confusion_matrix = np.zeros(
        (speaker_N, train_iter, phn_label_N, letter_num), dtype=int)
    spkind_word_confusion_matrix = np.zeros(
        (speaker_N, train_iter, wrd_label_N, word_num), dtype=int)
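
All five examples rely on two helpers, separate_speaker and get_separated_values, that are never shown. The following is a minimal sketch of plausible implementations, assuming the speaker-ID file is an .npz archive mapping each utterance key to a speaker label; the actual definitions in the source repository may differ.

import numpy as np

def separate_speaker(speaker_id_archive):
    # Return the sorted speaker labels and, per speaker, its utterance keys.
    labels = {k: str(speaker_id_archive[k]) for k in speaker_id_archive.files}
    speakers = sorted(set(labels.values()))
    keys_per_speaker = [sorted(k for k, v in labels.items() if v == s)
                        for s in speakers]
    return speakers, keys_per_speaker

def get_separated_values(data, keys_per_speaker):
    # Group a key -> array mapping into per-speaker lists of arrays.
    return [[data[k] for k in keys] for keys in keys_per_speaker]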
Example #2
gen_path = args.generator or (args.snapshot_dir / args.snapshot_name).with_suffix(gen_suffix)
dis_path = args.discriminator or (args.snapshot_dir / args.snapshot_name).with_suffix(dis_suffix)
cls_path = args.classifier or (args.snapshot_dir / args.snapshot_name).with_suffix(cls_suffix)

# Set up model
num_mels = 36
zdim = 5
hdim = 32
cdim = 8
adim = 32

speakers, speaker_individual_keys = separate_speaker(np.load(args.speaker_id))
speaker_num = len(speakers)
identity = np.identity(speaker_num, dtype=np.float32)

spkind_mcep = get_separated_values(np.load(args.mcep), speaker_individual_keys)
spkind_f0 = get_separated_values(np.load(args.f0), speaker_individual_keys)
spkind_ap = get_separated_values(np.load(args.ap), speaker_individual_keys)

mcep_mean = np.load(args.mcep_norm_param[0])
mcep_std = np.load(args.mcep_norm_param[1])
logf0_mean = np.load(args.logf0_norm_param[0])
logf0_std = np.load(args.logf0_norm_param[1])

generator = generator_class(speaker_num)
adverserial_discriminator = discriminator_class(num_mels, speaker_num, adim)

serializers.load_npz(gen_path, generator)
serializers.load_npz(dis_path, adverserial_discriminator)

spkind_kmfa = [speaker_individual_keys, spkind_mcep, spkind_f0, spkind_ap]
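
The mean/std statistics loaded above are typically used to z-normalize mel-cepstra and to convert F0 with the log-Gaussian normalized transformation common in voice conversion. A minimal sketch of that usage follows; the per-speaker indexing of logf0_mean/logf0_std is an assumption, not confirmed by the snippet.

import numpy as np

def normalize_mcep(mcep, mean, std):
    # Z-score normalization applied before the generator;
    # undo with mcep * std + mean.
    return (mcep - mean) / std

def convert_f0(f0, logf0_mean, logf0_std, src, tgt):
    # Log-Gaussian normalized F0 transformation from speaker src to tgt.
    f0 = np.asarray(f0, dtype=np.float64)
    out = np.zeros_like(f0)
    voiced = f0 > 0  # unvoiced frames (f0 == 0) stay zero
    out[voiced] = np.exp((np.log(f0[voiced]) - logf0_mean[src]) / logf0_std[src]
                         * logf0_std[tgt] + logf0_mean[tgt])
    return out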
Example #3
parser.add_argument("--size", type=int, default=1)

parser.add_argument("--mode", choices=["ML", "RND"], default="ML")

parser.add_argument("--LM", choices=["LSTM", "Bigram", "Unigram"])
parser.add_argument("--unique", action="store_true")

parser.add_argument("--LSTM_model", type=Path)

args = parser.parse_args()

speakers, spkind_keys = separate_speaker(np.load(args.speaker_id))
speaker_num = len(speakers)

target_idx = speakers.index(args.target_speaker)
src_letter_stateseq = get_separated_values(np.load(args.letter_stateseq),
                                           spkind_keys)[target_idx]
src_f0 = get_separated_values(np.load(args.f0), spkind_keys)[target_idx]
src_ap = get_separated_values(np.load(args.ap), spkind_keys)[target_idx]
mcep_min = np.load(args.mcep_norm_param[0])
mcep_max = np.load(args.mcep_norm_param[1])

if args.sentences is None:
    if args.LM == "Unigram":
        snt_generator = Unigram_generator(args.sentences_file)
    elif args.LM == "Bigram":
        snt_generator = Bigram_generator(args.sentences_file, args.parameter)
    elif args.LM == "LSTM":
        snt_generator = LSTMLM_generator(args.LSTM_model, args.sentences_file)
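# NOTE: if --sentences is omitted and --LM is not given, snt_generator
# is never assigned, and any later use of it raises a NameError.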

ap_generator = AP_generator(args.letter_num,
                            src_ap,
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Train StarGAN voice converter')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument("--train_data",
                        type=Path,
                        required=True,
                        help="training data")
    parser.add_argument("--speaker_id",
                        type=Path,
                        required=True,
                        help="speaker_id file")
    parser.add_argument("--output_file", type=Path, required=True)
    parser.add_argument('--epoch',
                        default=6000,
                        type=int,
                        help='number of epochs to learn')
    parser.add_argument("--epoch_start", type=int, default=0)

    parser.add_argument('--snapshot',
                        default=100,
                        type=int,
                        help='interval of snapshot')
    parser.add_argument('--batchsize', type=int, default=4, help='Batch size')
    parser.add_argument('--optimizer',
                        default='Adam',
                        choices=["Adam", "MomentumSGD", "RMSprop"],
                        type=str,
                        help='optimizer to use: Adam, MomentumSGD, RMSprop')
    parser.add_argument('--lrate',
                        default=0.00001,
                        type=float,
                        help='learning rate for MomentumSGD or RMSprop '
                             '(ignored by Adam)')
    parser.add_argument('--genpath',
                        type=str,
                        help='path for a pretrained generator')
    parser.add_argument('--clspath',
                        type=str,
                        help='path for a pretrained classifier')
    parser.add_argument('--advdispath',
                        type=str,
                        help='path for a pretrained real/fake discriminator')

    args = parser.parse_args()
    epsi = sys.float_info.epsilon

    output_file = args.output_file
    output_dir = output_file.with_suffix("")
    output_dir.mkdir(exist_ok=True, parents=True)

    all_source = np.load(args.train_data)
    Speakers, SpeakerIndividualKeys = separate_speaker(np.load(
        args.speaker_id))
    NormalizedAllData = get_separated_values(all_source, SpeakerIndividualKeys)
    SpeakerNum = len(Speakers)

    # Set input directories
    EpochNum = args.epoch
    BatchSize = args.batchsize

    SentenceNum = [len(SpeakerIndividualKeys[s]) for s in range(SpeakerNum)]
    MaxSentenceNum = max(SentenceNum)

    print('#GPU: {}'.format(args.gpu))
    print('#epoch: {}'.format(EpochNum))
    print('Optimizer: {}'.format(args.optimizer))
    print('Learning rate: {}'.format(args.lrate))
    print('Snapshot: {}'.format(args.snapshot))

    # Set up model
    num_mels = 36
    zdim = 5
    hdim = 32
    cdim = 8
    adim = 32

    # num_mels must match the feature dimension of the training data
    # (36-dimensional mel-cepstra here)
    generator_class = net.Generator_new
    classifier_class = net.Classifier1
    discriminator_class = net.AdvDiscriminator1
    loss_class = net.Loss_new

    generator = generator_class(SpeakerNum)
    paranum = sum(p.data.size for p in generator.params())
    print('Parameter #: {}'.format(paranum))

    # cdim = 8
    classifier = classifier_class(num_mels, SpeakerNum, cdim)
    paranum = sum(p.data.size for p in classifier.params())
    print('Parameter #: {}'.format(paranum))

    # adim = 32
    adverserial_discriminator = discriminator_class(num_mels, SpeakerNum, adim)
    # adverserial_discriminator = net.AdvDiscriminator_noactive(num_mels, SpeakerNum, adim)
    paranum = sum(p.data.size for p in adverserial_discriminator.params())
    print('Parameter #: {}'.format(paranum))

    if args.genpath is not None:
        try:
            serializers.load_npz(args.genpath, generator)
        except Exception:
            print('Could not load generator.')
    if args.clspath is not None:
        try:
            serializers.load_npz(args.clspath, classifier)
        except Exception:
            print('Could not load domain classifier.')
    if args.advdispath is not None:
        try:
            serializers.load_npz(args.advdispath, adverserial_discriminator)
        except Exception:
            print('Could not load real/fake discriminator.')

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        generator.to_gpu()
        classifier.to_gpu()
        adverserial_discriminator.to_gpu()
    xp = np if args.gpu < 0 else cuda.cupy

    # Set up optimizers
    # loss = net.Loss1(generator, classifier, adverserial_discriminator)
    loss = loss_class(generator, classifier, adverserial_discriminator)
    w_adv = 1.0
    w_cls = 1.0
    w_cyc = 1.0
    w_rec = 1.0
    if args.optimizer == 'MomentumSGD':
        opt_gen = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
        opt_cls = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
        opt_advdis = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
    elif args.optimizer == 'Adam':
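        # NOTE: the Adam branch ignores --lrate; fixed per-model alphas
        # are used instead.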
        opt_gen = optimizers.Adam(alpha=0.001, beta1=0.9)
        opt_cls = optimizers.Adam(alpha=0.00005, beta1=0.5)
        opt_advdis = optimizers.Adam(alpha=0.00001, beta1=0.5)
    elif args.optimizer == 'RMSprop':
        opt_gen = optimizers.RMSprop(lr=args.lrate)
        opt_cls = optimizers.RMSprop(lr=args.lrate)
        opt_advdis = optimizers.RMSprop(lr=args.lrate)
    opt_gen.setup(generator)
    opt_cls.setup(classifier)
    opt_advdis.setup(adverserial_discriminator)

    AllCombinationPairs = list(itertools.combinations(range(SpeakerNum), 2))
    # train
    for epoch in trange(args.epoch_start, EpochNum + 1):

        # shuffled_indexes[s][i] is an index into NormalizedAllData[s]
        shuffled_indexes = [
            myperm(SentenceNum[s], MaxSentenceNum) for s in range(SpeakerNum)
        ]

        for n in range(MaxSentenceNum // BatchSize):
            # batchlist_mcep[speaker_idx][sentence_idx_in_batch]
            batchlist_mcep = []
            begin_idx = n * BatchSize
            end_idx = begin_idx + BatchSize  # end_idx is exclusive
            for s in range(SpeakerNum):
                batch_tmp = []
                for idx in shuffled_indexes[s][begin_idx:end_idx]:
                    batch_tmp.append(
                        NormalizedAllData[s][idx].T)  # transpose to (dim, frames)
                batchlist_mcep.append(batch_tmp)
            # Convert batchlist into a list of arrays
            X = [batchlist2array(batchlist) for batchlist in batchlist_mcep]

            xin = [
                chainer.Variable(xp.asarray(Xs, dtype=np.float32)) for Xs in X
            ]

            # Iterate through all speaker pairs
            random.shuffle(AllCombinationPairs)
            for s0, s1 in AllCombinationPairs:
                AdvLoss_d, AdvLoss_g, ClsLoss_r, ClsLoss_f, CycLoss, RecLoss \
                    = loss.calc_loss(xin[s0], xin[s1], s0, s1, SpeakerNum)
                gen_loss = (w_adv * AdvLoss_g + w_cls * ClsLoss_f +
                            w_cyc * CycLoss + w_rec * RecLoss)
                cls_loss = ClsLoss_r
                advdis_loss = AdvLoss_d
                generator.cleargrads()
                gen_loss.backward()
                opt_gen.update()
                classifier.cleargrads()
                cls_loss.backward()
                opt_cls.update()
                adverserial_discriminator.cleargrads()
                advdis_loss.backward()
                opt_advdis.update()

            print('epoch {}, mini-batch {}:'.format(epoch, n + 1))
            print('AdvLoss_d={}, AdvLoss_g={}, ClsLoss_r={}, ClsLoss_f={}'.
                  format(AdvLoss_d.data, AdvLoss_g.data, ClsLoss_r.data,
                         ClsLoss_f.data))
            print('CycLoss={}, RecLoss={}'.format(CycLoss.data, RecLoss.data))
        save_loss(output_dir, AdvLoss_d.data, AdvLoss_g.data, ClsLoss_r.data,
                  ClsLoss_f.data, CycLoss.data, RecLoss.data)

        if epoch % args.snapshot == 0:
            snapshot_dir = output_dir / "snapshot"
            snapshot_dir.mkdir(exist_ok=True)
            snapshot(snapshot_dir, epoch, generator, classifier,
                     adverserial_discriminator)
            snapshot_feature_dir = output_dir / "snapshot_feature"
            snapshot_feature_dir.mkdir(exist_ok=True)
            output = {}
            with chainer.no_backprop_mode():
                identity = np.identity(SpeakerNum)
                for s in range(SpeakerNum):
                    speaker_vec = chainer.Variable(
                        xp.asarray(identity[s], dtype=np.float32))
                    for key, mcep in zip(SpeakerIndividualKeys[s],
                                         NormalizedAllData[s]):
                        mcep_T = mcep.T
                        out = generator.hidden_layer(
                            chainer.Variable(
                                xp.asarray(mcep_T[np.newaxis, :, :],
                                           dtype=np.float32)), speaker_vec)
                        out = np.squeeze(cuda.to_cpu(out.data))
                        output[key] = out.T
            np.savez(
                snapshot_feature_dir /
                f"{output_file.stem}_epoch_{epoch:05}.npz", **output)

    # output final result
    output = {}
    with chainer.no_backprop_mode():
        identity = np.identity(SpeakerNum)
        for s in range(SpeakerNum):
            speaker_vec = chainer.Variable(
                xp.asarray(identity[s], dtype=np.float32))
            for key, mcep in zip(SpeakerIndividualKeys[s],
                                 NormalizedAllData[s]):
                mcep_T = mcep.T
                out = generator.hidden_layer(
                    chainer.Variable(
                        xp.asarray(mcep_T[np.newaxis, :, :],
                                   dtype=np.float32)), speaker_vec)
                out = np.squeeze(cuda.to_cpu(out.data))
                output[key] = out.T
    np.savez(output_file, **output)
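
Example #4 also calls two undefined helpers, myperm and batchlist2array. Below is a minimal sketch consistent with how they are used above (a permutation padded to MaxSentenceNum so every speaker yields the same number of mini-batches, and stacking of transposed (dim, frames) arrays into a batch); the real implementations may differ.

import numpy as np

def myperm(n, max_n):
    # Permutation of range(n), padded with extra random draws up to max_n
    # so that all speakers produce the same number of mini-batches.
    perm = np.random.permutation(n)
    if max_n > n:
        perm = np.concatenate([perm, np.random.randint(0, n, max_n - n)])
    return perm

def batchlist2array(batch):
    # Stack variable-length (dim, frames) arrays into one (batch, dim, frames)
    # array, cropping every sequence to the shortest length in the batch.
    min_len = min(x.shape[-1] for x in batch)
    return np.stack([x[..., :min_len] for x in batch]).astype(np.float32)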
Example #5
parser.add_argument("--output_prefix", type=Path)

parser.add_argument("--key_of_pickuped_sentences", nargs="+", required=True)

parser.add_argument("--mode", choices=["ML", "RND"], default="ML")

args = parser.parse_args()

speakers, spkind_keys = separate_speaker(np.load(args.speaker_id))
speaker_num = len(speakers)

target_idx = speakers.index(args.target_speaker)
phn = np.load(args.phn)
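# phn_N below counts the distinct phone labels, assuming labels are
# 0-indexed integers.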
phn_N = int(max(map(np.max, phn.values()))) + 1
gold_transcription = get_separated_values(phn, spkind_keys)[target_idx]
src_f0 = get_separated_values(np.load(args.f0), spkind_keys)[target_idx]
src_ap = get_separated_values(np.load(args.ap), spkind_keys)[target_idx]
src_mcep = get_separated_values(np.load(args.mcep), spkind_keys)[target_idx]

ap_generator = AP_generator(phn_N,
                            src_ap,
                            letter_stateseq=gold_transcription,
                            flat=args.flat_ap,
                            mode=args.mode)
f0_generator = F0_generator(phn_N,
                            src_f0,
                            letter_stateseq=gold_transcription,
                            flat=args.flat_f0,
                            mode=args.mode)
mcep_generator = MCEP_generator(phn_N,