Example #1
import codecs
import yaml
from tensorflow.contrib.training import HParams

def load_config(config_file):
    with codecs.open(config_file, encoding='utf-8') as input_stream:
        params = yaml.load(input_stream, Loader=yaml.SafeLoader)
    hparams = HParams(**params)
    return hparams
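
# Usage sketch (an assumption, not part of the original snippet): the YAML file is expected to
# hold a flat mapping of hyperparameter names to values; "config/tacotron.yaml" is a
# hypothetical path.
#
#   cfg = load_config("config/tacotron.yaml")
#   print(cfg.sample_rate)
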
hparams = HParams(
    # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
    # text, you may want to use "basic_cleaners" or "transliteration_cleaners".
    cleaners="basic_cleaners",

    # If you only have 1 GPU or want to use only one GPU, please set num_gpus=0 and specify the
    # GPU idx on run. example:
    # example 1 GPU of index 2 (train on "/gpu2" only): CUDA_VISIBLE_DEVICES=2 python train.py
    # --model="Tacotron" --hparams="tacotron_gpu_start_idx=2"
    # If you want to train on multiple GPUs, simply specify the number of GPUs available,
    # and the idx of the first GPU to use. example:
    # example 4 GPUs starting from index 0 (train on "/gpu0"->"/gpu3"): python train.py
    # --model="Tacotron" --hparams="tacotron_num_gpus=4, tacotron_gpu_start_idx=0"
    # The hparams arguments can be directly modified on this hparams.py file instead of being
    # specified on run if preferred!

    # If one wants to train both Tacotron and WaveNet in parallel (provided WaveNet will be
    # trained on ground truth mel spectrograms), one needs to specify different GPU idxes.
    # example Tacotron+WaveNet on a machine with 4 or more GPUs. Two GPUs for each model:
    # CUDA_VISIBLE_DEVICES=0,1 python train.py --model="Tacotron"
    # --hparams="tacotron_gpu_start_idx=0, tacotron_num_gpus=2"
    # CUDA_VISIBLE_DEVICES=2,3 python train.py --model="WaveNet"
    # --hparams="wavenet_gpu_start_idx=2, wavenet_num_gpus=2"

    # IMPORTANT NOTE: If using N GPUs, please multiply the tacotron_batch_size by N below in the
    # hparams! (tacotron_batch_size = 32 * N)
    # Never use lower batch size than 32 on a single GPU!
    # Same applies for Wavenet: wavenet_batch_size = 8 * N (wavenet_batch_size can be smaller than
    #  8 if GPU is having OOM, minimum 2)
    # Please also apply the synthesis batch size modification likewise. (if N GPUs are used for
    # synthesis, minimal batch size must be N, minimum of 1 sample per GPU)
    # We did not add an automatic multi-GPU batch size computation to avoid confusion in the
    # user's mind and to provide more control to the user for resource-related decisions.

    # Acknowledgement:
    #	Many thanks to @MlWoo for his awesome work on multi-GPU Tacotron, which proved to run a
    #	little faster than the original pipeline even on a single GPU. Great work!

    # Hardware setup: Default supposes user has only one GPU: "/gpu:0" (Tacotron only for now!
    # WaveNet does not support multi GPU yet, WIP)
    # Synthesis also uses the following hardware parameters for multi-GPU parallel synthesis.
    tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
    tacotron_num_gpus=1,  # Determines the number of gpus in use for Tacotron training.
    split_on_cpu=True,
    # Determines whether to split data on CPU or on first GPU. This is automatically True when
    # more than 1 GPU is used.
    ###########################################################################################################################################

    # Audio
    # Audio parameters are the most important parameters to tune when using this work on your
    # personal data. Below are the beginner steps to adapt this work to your personal data:
    #	1- Determine your data sample rate: first you need to determine your audio sample_rate (how
    #		many samples are in a second of audio). This can be done using sox: "sox --i <filename>"
    #		(For this small tutorial, I will consider 24kHz (24000 Hz); the defaults are 22050 Hz,
    #		so there are plenty of examples to refer to.)
    #	2- Set the sample_rate parameter to your data's correct sample rate.
    #	3- Fix win_size and hop_size accordingly (supposing you follow our advice: 50ms window
    #		size and 12.5ms frame shift (hop_size)):
    #		a- win_size = 0.05 * sample_rate. In the tutorial example, 0.05 * 24000 = 1200.
    #		b- hop_size = 0.25 * win_size. Also equal to 0.0125 * sample_rate. In the tutorial
    #			example, 0.25 * 1200 = 0.0125 * 24000 = 300 (can set frame_shift_ms=12.5 instead).
    #	4- Fix n_fft, num_freq and upsample_scales parameters accordingly:
    #		a- n_fft can be either equal to win_size or the first power of 2 that comes after
    #			win_size. I usually recommend the latter to be more consistent with signal
    #			processing friends; no big difference to be seen however. For the tutorial
    #			example: n_fft = 2048 = 2**11.
    #		b- num_freq = (n_fft / 2) + 1. For the tutorial example: num_freq = 2048 / 2 + 1 = 1025.
    #		c- For WaveNet, the product of upsample_scales must be equal to hop_size. For the
    #			tutorial example: upsample_scales=[15, 20] where 15 * 20 = 300.
    #			It is also possible to use upsample_scales=[3, 4, 5, 5] instead. One must only
    #			keep in mind that upsample_kernel_size[0] = 2 * upsample_scales[0], so the
    #			training segments should be long enough (2.8~3x upsample_scales[0] * hop_size
    #			or longer) so that the first kernel can see the middle of the samples
    #			efficiently. The length of WaveNet training segments is set by the parameter
    #			"max_time_steps".
    #	5- Finally comes the silence trimming. This is very much data dependent, so I suggest
    #		trying preprocessing (or part of it, ctrl-C to stop), then using the .ipynb provided
    #		in the repo to listen to some inverted mel/linear spectrograms. That will first give
    #		you some idea about your above parameters, and it will also give you an idea about
    #		trimming. If silences persist, try reducing trim_top_db slowly. If samples are
    #		trimmed mid-word, try increasing it.
    #	6- If audio quality is too metallic or fragmented (or if linear spectrogram plots show
    #		black silent regions on top), then restart from step 2.
    #	(A small helper sketch that automates steps 2-4 and the batch-size rule is given right
    #		after this hparams block.)
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality of the network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value
    # Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
    # train samples of lengths between 3sec and 14sec are more than enough to make a model capable
    # of good parallelization.
    clip_mels_length=True,
    # For cases of OOM (Not really recommended, only use if facing unsolvable OOM errors,
    # also consider clipping your samples to smaller chunks)
    max_mel_frames=900,
    # Only relevant when clip_mels_length = True, please only use after trying output_per_steps=3
    #  and still getting OOM errors.

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,
    # Only set to True if using WaveNet; no difference in performance is observed in either case.
    silence_threshold=2,  # silence threshold used for sound trimming for wavenet preprocessing

    # Mel spectrogram
    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    frame_shift_ms=None,  # Can replace hop_size parameter. (Recommended: 12.5)

    # M-AILABS (and other datasets) trim params (these parameters are usually correct for any
    # data, but definitely must be tuned for specific speakers)
    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=23,

    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # faster and cleaner convergence)
    max_abs_value=4.,
    # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not
    # be too big to avoid gradient explosion,
    # not too small for fast convergence)
    normalize_for_wavenet=True,
    # whether to rescale to [0, 1] for wavenet. (better audio quality)
    clip_for_wavenet=True,
    # whether to clip [-max, max] before training/synthesizing with wavenet (better audio quality)

    # Contribution by @begeekmyfriend
    # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply filter
    preemphasis=0.97,  # filter coefficient.

    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
    fmax=7600,  # To be increased/reduced depending on data.

    # Griffin Lim
    power=1.5,
    # Only used in G&L inversion, usually values between 1.2 and 1.5 are a good choice.
    griffin_lim_iters=60,
    # Number of G&L iterations, typically 30 is enough but we use 60 to ensure convergence.
    ###########################################################################################################################################

    # Tacotron
    outputs_per_step=1,  # Was 1
    # number of frames to generate at each decoding step (increase to speed up computation and
    # allows for higher batch size, decreases G&L audio quality)
    stop_at_any=True,
    # Determines whether the decoder should stop when predicting <stop> to any frame or to all of
    # them (True works pretty well)
    embedding_dim=512,  # dimension of embedding space (these are NOT the speaker embeddings)

    # Encoder parameters
    enc_conv_num_layers=3,  # number of encoder convolutional layers
    enc_conv_kernel_size=(5,),  # size of encoder convolution filters for each layer
    enc_conv_channels=512,  # number of encoder convolution filters for each layer
    encoder_lstm_units=256,  # number of lstm units for each direction (forward and backward)

    # Attention mechanism
    smoothing=False,  # Whether to smooth the attention normalization function
    attention_dim=128,  # dimension of attention space
    attention_filters=8,  # was 32  # number of attention convolution filters
    attention_kernel=21,  # was (31,)  # kernel size of attention convolution
    cumulative_weights=True,
    # Whether to cumulate (sum) all previous attention weights or simply feed previous weights (
    # Recommended: True)

    # (shiyao): DCA attention
    attention_n_taps=11,
    prior_alpha=0.1,  # was 0.5
    prior_speed=1.0,  # was 3.6; measured with a linear regression on the training data

    # Decoder
    prenet_layers=[256, 256],  # number of layers and number of units of prenet
    decoder_layers=2,  # number of decoder lstm layers
    decoder_lstm_units=1024,  # number of decoder lstm units on each layer
    max_iters=2000,
    # Max decoder steps during inference (Just for safety from infinite loop cases)

    # Residual postnet
    postnet_num_layers=3,  # number of postnet convolutional layers
    postnet_kernel_size=(5,),  # size of postnet convolution filters for each layer
    postnet_channels=512,  # number of postnet convolution filters for each layer

    # CBHG mel->linear postnet
    cbhg_kernels=8,
    # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
    #  as "K-grams"
    cbhg_conv_channels=128,  # Channels of the convolution bank
    cbhg_pool_size=2,  # pooling size of the CBHG
    cbhg_projection=256,
    # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
    cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
    cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
    cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
    cbhg_rnn_units=128,
    # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in
    # shape

    # Loss params
    mask_encoder=True,
    # whether to mask encoder padding while computing attention. Set to True for better prosody
    # but slower convergence.
    mask_decoder=False,
    # Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
    #  be weighted, else recommended pos_weight = 20)
    cross_entropy_pos_weight=20,
    # Use class weights to reduce the stop token classes imbalance (by adding more penalty on
    # False Negatives (FN)) (1 = disabled)
    predict_linear=False,
    # Whether to add a post-processing network to the Tacotron to predict linear spectrograms (
    # True mode Not tested!!)
    ###########################################################################################################################################

    # Tacotron Training
    # Reproduction seeds
    tacotron_random_seed=5339,
    # Determines initial graph and operations (i.e: model) random state for reproducibility
    tacotron_data_random_state=1234,  # random state for train test split repeatability

    # performance parameters
    tacotron_swap_with_cpu=False,
    # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause
    # major slowdowns! Only use when critical!)

    # train/test split ratios, mini-batches sizes
    tacotron_batch_size=20,  # number of training samples on each training step (was 32)
    # Tacotron Batch synthesis supports ~16x the training batch size (no gradients during
    # testing).
    # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
    #  different from training. We thus recommend masking the encoder.
    tacotron_synthesis_batch_size=20,
    # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
    tacotron_test_size=0.05,
    # % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is
    # enough to have a good idea about overfit)
    tacotron_test_batches=None,  # number of test batches.

    # Learning rate schedule
    tacotron_decay_learning_rate=True,
    # boolean, determines if the learning rate will follow an exponential decay
    tacotron_start_decay=50000,  # Step at which learning decay starts
    tacotron_decay_steps=50000,  # Determines the learning rate decay slope (UNDER TEST)
    tacotron_decay_rate=0.4,  # learning rate decay rate (UNDER TEST)
    tacotron_initial_learning_rate=1e-3,  # starting learning rate
    tacotron_final_learning_rate=1e-5,  # minimal learning rate

    # Optimization parameters
    tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
    tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
    tacotron_adam_epsilon=1e-6,  # AdamOptimizer Epsilon parameter

    # Regularization parameters
    tacotron_reg_weight=1e-7,  # regularization weight (for L2 regularization)
    tacotron_scale_regularization=False,
    # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
    #  high and biasing the model)
    tacotron_zoneout_rate=0.1,  # zoneout rate for all LSTM cells in the network
    tacotron_dropout_rate=0.5,  # dropout rate for all convolutional layers + prenet
    tacotron_clip_gradients=True,  # whether to clip gradients

    # Evaluation parameters
    natural_eval=False,
    # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same
    #  teacher-forcing ratio as in training (just for overfit)

    # Decoder RNN learning can be done in one of two ways:
    #	Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
    #	Curriculum Learning Scheme: the transition from teacher forcing to sampling from previous
    # outputs is a function of the global step. (teacher forcing ratio decay) mode="scheduled"
    # The second approach is inspired by:
    # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
    # Can be found under: https://arxiv.org/pdf/1506.03099.pdf
    tacotron_teacher_forcing_mode="constant",
    # Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio
    # decay. (Preference: scheduled)
    tacotron_teacher_forcing_ratio=1.,
    # Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder
    # inputs, Only relevant if mode="constant"
    tacotron_teacher_forcing_init_ratio=1.,
    # initial teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_final_ratio=0.,
    # final teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_start_decay=10000,
    # starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_steps=280000,
    # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_alpha=0.,
    # teacher forcing ratio decay rate. Relevant if mode="scheduled"
    ###########################################################################################################################################

    # Tacotron-2 integration parameters
    train_with_GTA=False,
    # Whether to use GTA mels to train WaveNet instead of ground truth mels.
    ###########################################################################################################################################

    # Eval sentences (if no eval text file was specified during synthesis, these sentences are
    # used for eval)
    sentences=[
        # From July 8, 2017 New York Times:
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of "
        "style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
        # From Google's Tacotron example page:
        "Generative adversarial network or variational auto-encoder.",
        "Basilar membrane and otolaryngology are not auto-correlations.",
        "He has read the whole thing.",
        "He reads books.",
        "He thought it was time to present the present.",
        "Thisss isrealy awhsome.",
        "Punctuation sensitivity, is working.",
        "Punctuation sensitivity is working.",
        "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
        "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
        "Tajima Airport serves Toyooka.",
        # From The web (random long utterance)
        "Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
        This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
        the adopted architecture is able to perform this task with wild success.",
        "Thank you so much for your support!",
    ],

    ### SV2TTS ###
    speaker_embedding_size=256,
    silence_min_duration_split=0.4,  # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration=1.6,  # Duration in seconds below which utterances are discarded
)
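
# Helper sketch referenced in the audio-tuning steps above (an assumption, not part of the
# original hparams.py). It restates steps 2-4 of that guide (win_size = 0.05 * sample_rate,
# hop_size = 0.25 * win_size, n_fft = first power of 2 >= win_size, num_freq = n_fft // 2 + 1)
# together with the "tacotron_batch_size = 32 * N GPUs" rule from the multi-GPU note; the
# function name derive_audio_hparams is hypothetical.
import math


def derive_audio_hparams(sample_rate, num_gpus=1):
    win_size = int(0.05 * sample_rate)           # 50 ms window
    hop_size = int(0.25 * win_size)              # 12.5 ms frame shift
    n_fft = 2 ** math.ceil(math.log2(win_size))  # first power of 2 >= win_size
    return dict(
        sample_rate=sample_rate,
        win_size=win_size,
        hop_size=hop_size,
        n_fft=n_fft,
        num_freq=n_fft // 2 + 1,
        tacotron_batch_size=32 * num_gpus,       # never below 32 per GPU
    )


# For the 24 kHz tutorial example this yields win_size=1200, hop_size=300, n_fft=2048 and
# num_freq=1025, matching the values worked out in the comments. The resulting dict can be
# applied with hparams.override_from_dict(...), as shown in Example #8 below.
print(derive_audio_hparams(24000, num_gpus=1))
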
Example #3
def main(model_dir, train_data, eval_data, vocab_file, hparams):
    tf.logging.set_verbosity(tf.logging.INFO)

    hparams_ = HParams(num_epochs=10,
                       batch_size=16,
                       max_steps=10000,
                       units=150,
                       layers=3,
                       dropout=0.0,
                       question_max_words=30,
                       passage_max_words=150,
                       predict_passage_max_words=800,
                       answer_max_words=50,
                       vocab_size=30000,
                       emb_size=300,
                       r=0.8,
                       cudnn=False,
                       grad_clip=5.0,
                       tgt_sos_id=1,
                       tgt_eos_id=2,
                       word_vocab_file=vocab_file)
    hparams_.parse(hparams)
    hparams = hparams_

    config = tf.ConfigProto()
    # config.intra_op_parallelism_threads = 32
    # config.inter_op_parallelism_threads = 32

    run_config = tf.estimator.RunConfig(log_step_count_steps=1,
                                        tf_random_seed=19830610,
                                        model_dir=model_dir,
                                        save_summary_steps=1,
                                        session_config=config)

    with tf.Session() as sess:
        test = input_fn([train_data],
                        hparams=hparams,
                        mode=tf.estimator.ModeKeys.EVAL,
                        batch_size=hparams.batch_size)

        print(sess.run([test]))

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params=hparams,
                                       config=run_config)

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn([train_data],
                                  hparams=hparams,
                                  mode=tf.estimator.ModeKeys.TRAIN,
                                  num_epochs=hparams.num_epochs,
                                  batch_size=hparams.batch_size),
        max_steps=hparams.max_steps,
        hooks=None)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn([eval_data],
                                  hparams=hparams,
                                  mode=tf.estimator.ModeKeys.EVAL,
                                  batch_size=hparams.batch_size),
        exporters=[
            tf.estimator.LatestExporter(
                name="predict",  # name of the folder under "export" to which the model will be exported
                serving_input_receiver_fn=partial(serving_input_fn,
                                                  params=hparams),
                exports_to_keep=1,
                as_text=True)
        ],
        steps=10,
        throttle_secs=1200)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if not os.path.exists(name_dir):
        os.makedirs(name_dir)

    metadata = load_metadata(args.path)
    if args.robot:
        metadata = metadata[metadata['robot'] == args.robot]
    if args.filter_primitive:
        metadata = metadata[metadata['primitives'] == args.filter_primitive]

    ncam = min(metadata['ncam'].frame.unique().tolist())
    print('loaded {} records with robot={} and primitive={}'.format(
        len(metadata), args.robot, args.filter_primitive))

    hparams = HParams(**default_loader_hparams())
    hparams.target_adim = args.target_adim
    hparams.target_sdim = args.target_sdim
    hparams.action_mismatch = 3
    hparams.state_mismatch = 3
    hparams.cams_to_load = list(range(ncam))
    hparams.load_T = min(min(metadata['state_T']),
                         min(metadata['img_T'])).frame
    assert len(args.img_dims) == 2, "should be (height, width) tuple"
    hparams.img_size = tuple(args.img_dims)

    print('saving images with adim-{}, sdim-{}, img_dims-{}, T-{}'.format(
        hparams.target_adim, hparams.target_sdim, hparams.img_size,
        hparams.load_T))

    record_metadata = {
    programs = [0, 0, 8, 16, 24, 33, 40, 48, 64, 72, 80, 88, 96, 104, 120]
else:
    print('invalid model name.')
    exit()

trc_len = len(tracks)
trc_idx = sorted(list(range(trc_len)),
                 key=lambda x: 0 if tracks[x] == 'Bass' else 1)
note_size = 84
note_offset = 24
time_note = note_size * trc_len + 1
end_note = note_size * trc_len + 2
hparams = HParams(
    **{
        "n_vocab": end_note + 1,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12
    })

top_p = [int(p) for p in args.top_p.split(',')] if ',' in args.top_p else [
    int(args.top_p), int(args.top_p)
]
temperature = args.temperature
chords = [c for c in args.chord.split('|') if len(c) > 0]
for c in chords:
    if not (c == 'auto' or c[0] in 'ABCDEFG'):
        print('invalid chord name.')
        exit()

Example #6
args = parser.parse_args()

with open('ja-bpe.txt') as f:
    bpe = f.read().split('\n')

with open('emoji.json') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if 'small' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12
        })
elif 'medium' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 1024,
            "n_head": 16,
            "n_layer": 24
        })
elif 'large' in args.model:
    hparams = HParams(
        **{
Example #7
parser.add_argument("--quant_method", default=0, type=int)
parser.add_argument("--quant_size", default=128.0, type=float)

parser.add_argument("--batch_size", default=16, type=int)
parser.add_argument("--steps", default=100000, type=int)
parser.add_argument("--test_per_iterations", default=500, type=int)

parser.add_argument("--queue_capacity", default=32, type=int)

parser.add_argument("--max_alpha", default=50.0, type=float)
parser.add_argument("--alpha_div", default=100000, type=float)

args = parser.parse_args()

hparams = HParams()
hyper_parameters = {
    'model_type': args.model_type,

    'train_dataset_path': args.dataset,
    'test_dataset_path': args.test_dataset,

    'checkpoint': args.checkpoint,
    'metagraph': args.metagraph,

    'in_img_width': args.img_x,
    'in_img_height': args.img_y,
    'channels': args.channels,

    'quant_method': args.quant_method,
    'quant_size': args.quant_size,
Example #8
def sample_model(model_name='124M',
                 nsamples=1,
                 batch_size=1,
                 length=12,
                 temperature=1,
                 top_k=4,
                 top_p=0,
                 models_dir='models',
                 data_type='fp32'):
    """Run the sample_model.

    :model_name=124M : String, which model to use
    :nsamples=0 : Number of samples to return, if 0, continues to
     generate samples indefinitely.
    :batch_size=1 : Number of batches (only affects speed/memory).
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=4 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
     :models_dir : path to parent folder containing model subfolders
     (i.e. contains the <model_name> folder)
    """

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    enc = encoder.get_encoder(model_name, models_dir)
    hparams = HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)

    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    # start_ids has shape [batch_size, start_len].flatten()
    # start_ids = [15496, 11, 616, 3290, 468,
    #             15496, 11, 616, 3290, 469,
    #             15496, 11, 616, 3290, 470,
    #             15496, 11, 616, 3290, 471]
    start_ids = [enc.encoder['<|endoftext|>'] for i in range(batch_size)]

    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph("{}/{}/model.ckpt.meta".format(
            models_dir, model_name))
        print("[INFO] restore the model {}/{}".format(models_dir, model_name))
        saver.restore(sess,
                      ("{}/{}/model.ckpt".format(models_dir, model_name)))

        if data_type == 'fp32':
            tf_data_type = tf.float32
        elif data_type == 'fp16':
            tf_data_type = tf.float16
        else:
            assert False, "data_type must be 'fp32' or 'fp16'"

        decoder_args = TransformerArgument(beam_width=1,
                                           head_num=hparams.n_head,
                                           size_per_head=hparams.n_embd //
                                           hparams.n_head,
                                           num_layer=hparams.n_layer,
                                           dtype=tf_data_type,
                                           kernel_init_range=0.00,
                                           bias_init_range=0.00)

        decoding_args = DecodingGpt2Argument(hparams.n_vocab,
                                             enc.encoder['<|endoftext|>'],
                                             enc.encoder['<|endoftext|>'],
                                             length + 2, decoder_args, top_k,
                                             top_p, temperature)

        ckpt_dict = {}
        for var in tf.trainable_variables():
            ckpt_dict[var.name] = var
        decoding_vars = tf.trainable_variables()

        op_output = ft_gpt2_op(decoding_vars, decoding_args, batch_size,
                               start_ids)

        generated = 0

        while nsamples == 0 or generated < nsamples:
            print("[INFO] FT op time: {}".format(
                time_test(sess, op_output, iterations=5, warmup=True)))
            op_out = sess.run(op_output)

            for i in range(batch_size):
                generated += 1

                text = enc.decode(op_out[i][1:])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
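
# Invocation sketch (an assumption): the helpers referenced inside sample_model (encoder,
# TransformerArgument, DecodingGpt2Argument, ft_gpt2_op, time_test) come from the surrounding
# project, and a converted 124M checkpoint is expected under models/124M/.
if __name__ == '__main__':
    sample_model(model_name='124M',
                 nsamples=4,
                 batch_size=2,
                 length=32,
                 top_k=4,
                 top_p=0,
                 models_dir='models',
                 data_type='fp16')
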
Example #9
logger = logging.getLogger(__name__)

RANDOM_SEED = 1234567

params = HParams(
    # dataset
    min_playcount=10,
    min_interactions=10,

    # model
    optimizer='adam',
    learning_rate=0.0001,
    embedding_dim=32,
    clip_norm=1.0,
    margin=1.0,
    initialized_std=1.0,
    activate_l2_norm=False,

    # training iter & batch
    n_epochs=20,
    eval_every_n_batches=2000,

    # valid & test eval parameters
    n_users_in_chunk=100,
    n_users_in_validation=3000,
    n_users_in_test=3000,
    n_negatives=1,
    mode='train')

sampler_params = {
    'uniform': {
        'sampler': 'uniform',
Example #10
        hparams=HParams(
            batch_size=64,  # 64
            z_size=256,
            max_seq_len=256,
            feature_dim=90,

            # seq_vae
            learning_rate=0.001,
            decay_rate=0.9999,  # Learning rate decay per mini batch.
            min_learning_rate=0.00001,  # Minimum learning rate.
            max_beta=0.2,  # beta of the kl_loss
            beta_decay_rate=0.0,
            dropout_keep_prob=0.75,
            grad_clip=1.0,  # gradient clipping.

            # encoder
            free_bits=8,
            enc_rnn_size=[256, 256],
            residual_encoder=True,

            # decoder
            dec_rnn_size=[256, 256],
            compute_rewards_step=16,
            dec_update_rate=0.8,
            rollout_num=1,
            residual_decoder=True,

            # discriminator-rnn
            dis_learning_rate=0.0,
            dis_rnn_size=[256, 256],
            dis_train_freq=5,
        ),
Example #11
def load_hparams(model_dir, default_params):
    hparams_path = os.path.join(model_dir, HPARAMS_FILE)
    hparams = HParams(default_params)
    assert os.path.exists(hparams_path)
    with open(hparams_path) as f:
        return hparams.parse_json(f.read())
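
# Usage sketch (an assumption, not from the original repo): the file named by HPARAMS_FILE is
# expected to contain a JSON dict of overrides, e.g. written with hparams.to_json() at training
# time. HParams.parse_json() only overrides keys that already exist on the object:
defaults = HParams(batch_size=16, learning_rate=1e-3)
defaults.parse_json('{"batch_size": 32}')
print(defaults.batch_size)  # -> 32
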
Example #12
class HParamsCenter(object):
    default_namespace = 'other_hparams'

    def __init__(self,
                 default_preprocessing_hparams=None,
                 default_model_hparams=None,
                 default_training_hparams=None,
                 models_dir="src.models"):
        self.hparams_dict = defaultdict(HParams)
        self.models_dir = models_dir

        # parsed
        args = self._parsed_args()
        self.parsed_hparams = HParams()
        for key, val in args.__dict__.items():
            self.parsed_hparams.add_hparam(key, val)
        self.register_hparams(self.parsed_hparams, 'parsed_hparams')

        # pre-processing
        self.preprocessing_hparams = default_preprocessing_hparams or HParams()
        self.preprocessing_hparams.parse(
            self.parsed_hparams.preprocessing_hparams)
        self.register_hparams(self.preprocessing_hparams,
                              'preprocessing_hparams')

        # model
        self.model_hparams = default_model_hparams or HParams()
        self.model_hparams = merge_hparams(
            self.model_hparams,
            self._fetch_model_specific_hparams(
                self.parsed_hparams.network_class,
                self.parsed_hparams.network_type, self.models_dir))
        self.model_hparams.parse(self.parsed_hparams.model_hparams)
        self.register_hparams(self.model_hparams, 'model_hparams')

        # traning
        self.training_hparams = default_training_hparams or HParams()
        self.training_hparams.parse(self.parsed_hparams.training_hparams)
        self.register_hparams(self.training_hparams, 'training_hparams')

    @staticmethod
    def _fetch_model_specific_hparams(network_class, network_type, models_dir):
        model_hparams = HParams(model_class=None)
        if network_class is not None and network_type is not None:
            model_module_name = 'model_%s' % network_type
            model_class_name = underline_to_camel(model_module_name)
            try:
                src_module = __import__(
                    '%s.%s.%s' %
                    (models_dir, network_class, model_module_name))
                model_class = eval(
                    'src_module.models.%s.%s.%s' %
                    (network_class, model_module_name, model_class_name))
                model_hparams = model_class.get_default_model_parameters()
                model_hparams.add_hparam('model_class',
                                         model_class)  # add model class
            except ImportError:
                print('Fatal Error: no model module: \"src.models.%s.%s\"' %
                      (network_class, model_module_name))
            except AttributeError:
                print(
                    'Fatal Error: probably (1) no model class named as %s.%s, '
                    'or (2) the class no \"get_default_model_parameters()\"' %
                    (network_class, model_module_name))
        return model_hparams

    @staticmethod
    def _parsed_args():
        parser = argparse.ArgumentParser()
        parser.register('type', 'bool', (lambda x: x.lower() in
                                         ("yes", "true", "t", "1")))
        parser.add_argument('--mode', type=str, default='train', help='')
        parser.add_argument('--dataset', type=str, default='none', help='')
        parser.add_argument('--network_class',
                            type=str,
                            default='transformer',
                            help='')
        parser.add_argument('--network_type', type=str, default=None, help='')
        parser.add_argument('--gpu',
                            type=str,
                            default='3',
                            help='selected gpu index')
        parser.add_argument('--gpu_mem',
                            type=float,
                            default=None,
                            help='selected gpu index')
        parser.add_argument('--model_dir_prefix',
                            type=str,
                            default='prefix',
                            help='model dir name prefix')
        parser.add_argument('--machine',
                            type=str,
                            default='none',
                            help='using aws')

        # parsing parameters group
        parser.add_argument('--preprocessing_hparams',
                            type=str,
                            default='',
                            help='')
        parser.add_argument('--model_hparams', type=str, default='', help='')
        parser.add_argument('--training_hparams',
                            type=str,
                            default='',
                            help='')

        parser.set_defaults(shuffle=True)
        return parser.parse_args()

    def register_hparams(self, hparams, name):
        assert isinstance(hparams, HParams)
        assert isinstance(name, str)

        if name in self.hparams_dict:
            self.hparams_dict[name] = merge_hparams(self.hparams_dict[name],
                                                    hparams)
        else:
            self.hparams_dict[name] = hparams

    @property
    def all_hparams(self):
        all_hparams = HParams()
        for name, hp in self.hparams_dict.items():
            all_hparams = merge_hparams(all_hparams, hp)
        return all_hparams

    def __setitem__(self, key, value):
        assert isinstance(key, str)
        # this is added to the default
        # self.hparams_dict[self.default_namespace][key] = value
        key_found = False
        for _, hps in self.hparams_dict.items():
            try:
                if key in hps:
                    key_found = True
            # when tf==1.4.1, directly use "in" will raise TypeError: argument of type 'HParams' is not iterable
            except TypeError:
                if key in hps.values():
                    key_found = True
            if key_found:
                hps.set_hparam(key, value)
                break

        if not key_found:  # not found, set it
            self.hparams_dict[self.default_namespace].add_hparam(key, value)

    def __getitem__(self, item):
        assert isinstance(item, str)

        for name, hp in self.hparams_dict.items():
            try:
                return getattr(hp, item)
            except AttributeError:
                pass
        raise AttributeError('no item named as \'%s\'' % item)

    def __contains__(self, item):
        if isinstance(item, str):
            for key, hps in self.hparams_dict.items():
                try:
                    if item in hps:
                        return True
                # when tf==1.4.1, directly use "in" will raise TypeError: argument of type 'HParams' is not iterable
                except TypeError:
                    if item in hps.values():
                        return True
        return False
Example #13
    def all_hparams(self):
        all_hparams = HParams()
        for name, hp in self.hparams_dict.items():
            all_hparams = merge_hparams(all_hparams, hp)
        return all_hparams
Example #14
        wav = wav.astype(np.float32) / np.iinfo(np.int16).max
        fname = os.path.basename(path)
        yield dict(sample=np.string_(fname), wav=wav)


test_input_fn = generator_input_fn(
    x=test_data_generator,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_epochs=1,
    queue_capacity=10 * TEST_BATCH_SIZE,
    num_threads=1,
)

model = create_model(
    config=RunConfig(model_dir=model_dir),
    hparams=HParams(**params),
)
it = model.predict(input_fn=test_input_fn)

# last batch will contain padding, so remove duplicates
submission = dict()
for t in tqdm(it):
    fname, label = t['sample'].decode(), id2name[t['label']]
    submission[fname] = label

with open(os.path.join(model_dir, 'submission.csv'), 'w') as fout:
    fout.write('fname,label\n')
    for fname, label in submission.items():
        fout.write('{},{}\n'.format(fname, label))
from tensorflow.keras.layers import BatchNormalization, AveragePooling2D, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# import for showing the confusion matrix
import itertools
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

#Hyper parameters
hparams = HParams(
    n_classes=10,  # number of different classes in dataset
    learning_rate=1e-4,  # fixed learning rate
    train_batch_size=32,  # training batch size
    val_batch_size=32,  # validation batch size
    test_batch_size=32,  # testing batch size
    n_epochs=10,  # number of epochs to train
    input_name='input_1',  # name of the input tensor for first layer of Keras model
    data_dir='/tmp/cifar-data/',  # path to data directory
    checkpoint_dir='/tmp/checkpoints/'  # path to model checkpoint directory
)

#Data preprocessing
# URL for the data-set on the internet.
data_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

# Width and height of each image.
img_size = 32

# Number of channels in each image, 3 channels: Red, Green, Blue.
num_channels = 3
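
# A minimal training sketch (an assumption, not part of the original script) showing how these
# hparams could drive a small Keras model on CIFAR-10 via the built-in loader; train_cifar and
# the layer sizes below are hypothetical, only the imports and constants above are reused.
import tensorflow as tf


def train_cifar(hparams):
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train = x_train.astype('float32') / 255.0
    x_test = x_test.astype('float32') / 255.0

    inputs = Input(shape=(img_size, img_size, num_channels), name=hparams.input_name)
    x = tf.keras.layers.Conv2D(32, 3, activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = AveragePooling2D()(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(hparams.n_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(hparams.learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train,
              batch_size=hparams.train_batch_size,
              epochs=hparams.n_epochs,
              validation_data=(x_test, y_test))
    return model
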
Example #16
	return Config(**config_dict)


CONFIG_MAP = dict()

CONFIG_MAP['lc-cat-mel_2bar_big'] = Config(
	model=LCMusicVAE(lstm_models.BidirectionalLstmEncoder(), lstm_models.CategoricalLstmDecoder()),
	hparams=merge_hparams(
		lstm_models.get_default_hparams(),
		HParams(
			batch_size=2,
			max_seq_len=32,  # 2 bars w/ 16 steps per bar
			z_size=512,
			encoded_z_size=8,
			enc_rnn_size=[2048],
			dec_rnn_size=[128, 128],
			free_bits=0,
			max_beta=0.5,
			beta_rate=0.99999,
			sampling_schedule='inverse_sigmoid',
			sampling_rate=1000,
		)),
	note_sequence_augmenter=data.NoteSequenceAugmenter(transpose_range=(-5, 5)),
	data_converter=data.OneHotMelodyConverter(
		valid_programs=data.MEL_PROGRAMS,
		skip_polyphony=False,
		max_bars=100,  # Truncate long melodies before slicing.
		slice_bars=2,
		steps_per_quarter=4),
	train_examples_path=None,
	eval_examples_path=None,
Example #17
    config_dict.update(update_dict)
    return Config(**config_dict)


CONFIG_MAP = {}

# Melody
CONFIG_MAP['cat-mel_2bar_small'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.CategoricalLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=32,  # 2 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
        )),
    note_sequence_augmenter=data.NoteSequenceAugmenter(transpose_range=(-5,
                                                                        5)),
    data_converter=data.OneHotMelodyConverter(
        valid_programs=data.MEL_PROGRAMS,
        skip_polyphony=False,
        max_bars=100,  # Truncate long melodies before slicing.
        slice_bars=2,
        steps_per_quarter=4),
    train_examples_path=None,
    eval_examples_path=None,
)
Example #18
def main(_):

    # tf.estimator will load/reuse anything found in its model_dir, so
    # we make sure to clear its contents before every training run.
    # For predictions, however, we of course want to load the previously
    # trained model from disk.
    if tf.gfile.Exists(args.model_dir) and not args.predict_only:
        tf.gfile.DeleteRecursively(args.model_dir)
    tf.gfile.MakeDirs(args.model_dir)

    hparams = HParams(**vars(args))

    # We will use the 20 newsgroups dataset to train our model.
    # Note that we won't be using the labels, since our model is simply
    # learning to reconstruct its inputs as its output.
    train_file_path = os.path.join(hparams.data_dir,
                                   '20ng-train-all-terms.txt')

    # Define the path to the file that we'll store our vocabulary in.
    # This file will have the same number of lines as our vocab_size.
    # Each line will contain a single word in our vocabulary, listed in
    # order of decreasing frequency seen in our training data.
    vocab_path = os.path.join(hparams.processed_data_dir, 'vocab.txt')

    # Data preparation: getting vocabulary and saving tfrecords format.
    if not tf.gfile.Exists(vocab_path):
        print('Extracting vocab, labels, and tokenized texts from data.')
        vocab, labels, texts = newsgroups.fit_and_extract(
            train_file_path, hparams.vocab_size)
        print('Saving vocabulary to {}.'.format(vocab_path))
        with open(vocab_path, 'w+') as f:
            f.write('\n'.join(vocab))

        tfrecords_path = os.path.join(hparams.processed_data_dir,
                                      'embed.tfrecords')
        print('Saving tfrecords to {}.'.format(tfrecords_path))
        tfrecords.save_tfrecords(out_path=tfrecords_path,
                                 labels=labels,
                                 texts=texts,
                                 vocab=vocab)
    else:
        print('Reading existing vocabulary from {}.'.format(vocab_path))
        with open(vocab_path) as f:
            vocab = [l.strip() for l in f.readlines()]

    hparams.vocab = vocab
    print('Creating autoencoder.')
    autoencoder = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.model_dir,
        config=tf.estimator.RunConfig(log_step_count_steps=10000),
        params=hparams)

    if not args.predict_only:
        print('Training autoencoder.')
        autoencoder.train(
            input_fn=lambda: input_fn(hparams.processed_data_dir, hparams),
            steps=1000)

    sample_sentences = [
        'i like dogs', 'i am a test sentence',
        'TensorFlow is a fun library to use'
    ]
    pred_inputs = []
    for sent in sample_sentences:
        token_ids = [
            vocab.index(w) for w in sent.split()[:args.max_seq_len]
            if w in vocab
        ]
        # Pad if necessary.
        if len(token_ids) < args.max_seq_len:
            token_ids.extend([0] * (args.max_seq_len - len(token_ids)))
        pred_inputs.append(token_ids)

    pred_inp_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': np.asarray(pred_inputs)}, shuffle=False)
    predictions = autoencoder.predict(input_fn=pred_inp_fn)

    print('Sample predictions:')
    for i, prediction in enumerate(predictions):
        clean_prediction = ' '.join(
            [tok.decode() for tok in prediction if tok != b'_UNK'])
        print('\nExpected:', sample_sentences[i], sep='\t')
        print('Actual:  ', clean_prediction, sep='\t')
Example #19
hparams = HParams(
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality of the network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value
    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,
    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    frame_shift_ms=None,  # Can replace hop_size parameter. (Recommended: 12.5)
    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # faster and cleaner convergence)
    max_abs_value=4.0,
    # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not
    # be too big to avoid gradient explosion,
    # not too small for fast convergence)
    # Contribution by @begeekmyfriend
    # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply filter
    preemphasis=0.97,  # filter coefficient.
    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
    fmax=7600,  # To be increased/reduced depending on data.
    ###################### Our training parameters #################################
    img_size=96,
    fps=25,
    batch_size=16,
    initial_learning_rate=1e-4,
    nepochs=200000000000000000,  ### ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs
    num_workers=16,
    checkpoint_interval=3000,
    eval_interval=3000,
    save_optimizer_state=True,
    syncnet_wt=0.0,  # is initially zero, will be set automatically to 0.03 later. Leads to faster convergence.
    syncnet_batch_size=64,
    syncnet_lr=1e-4,
    syncnet_eval_interval=10000,
    syncnet_checkpoint_interval=10000,
    disc_wt=0.07,
    disc_initial_learning_rate=1e-4,
)
Example #20
"""
Based on https://github.com/openai/gpt-2/blob/master/src/model.py
(just minor style adjustments made).
"""
import numpy as np
import tensorflow as tf
from tensorflow.contrib.training import HParams

HPARAMS = {
    'default':
    HParams(n_vocab=0,
            n_ctx=1024,
            n_embd=256,
            n_head=12,
            n_layer=12,
            threshold=0.6,
            alpha=0.1),
    'small':
    HParams(n_vocab=0,
            n_ctx=256,
            n_embd=256,
            n_head=8,
            n_layer=8,
            threshold=0.6,
            alpha=0.1),
    'tiny':
    HParams(n_vocab=0,
            n_ctx=64,
            n_embd=64,
            n_head=4,
            n_layer=4,
Example #21
def test_model(tf_model_path, gluon_model):
    # test data
    ctx = mx.cpu()

    seed = 123
    batch_size = 3
    seq_length = 32
    vocab_size = gluon_model._backbone_model._vocab_size
    np.random.seed(seed)
    input_ids = np.random.randint(
        0,
        vocab_size,
        (batch_size, seq_length)
    )

    with open(os.path.join(tf_model_path, 'hparams.json'), 'r') as hf:
        tf_cfg = json.load(hf)
    hparams = HParams(
        n_vocab=tf_cfg['n_vocab'],
        n_ctx=tf_cfg['n_ctx'],
        n_embd=tf_cfg['n_embd'],
        n_head=tf_cfg['n_head'],
        n_layer=tf_cfg['n_layer'],
    )
    tf_start_states = np.zeros((batch_size, hparams.n_layer, 2, hparams.n_head, 0, hparams.n_embd // hparams.n_head))
    gl_start_states = gluon_model.init_states(batch_size, ctx)

    # gluon model
    gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
    gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states, mx.np.array(0, dtype=np.int32, ctx=ctx))
    gl_logits_2, _ = gluon_model(gl_input_ids, gl_states, mx.np.array(seq_length, dtype=np.int32, ctx=ctx))

    # tf model
    with tf.Session(graph=tf.Graph()) as sess:    
        tf.set_random_seed(None)
        tf_context = tf.placeholder(tf.int32, [batch_size, seq_length])
        tf_past = tf.placeholder(tf.float32, [batch_size, hparams.n_layer, 2, hparams.n_head,
                                            None, hparams.n_embd // hparams.n_head])
        tf_lm_output = model.model(hparams=hparams, X=tf_context, past=tf_past, reuse=tf.AUTO_REUSE)
        
        tf_saver = tf.train.Saver()
        tf_ckpt = tf.train.latest_checkpoint(tf_model_path)
        tf_saver.restore(sess, tf_ckpt)
        
        tf_output_1 = sess.run(tf_lm_output, feed_dict={tf_context: input_ids, tf_past: tf_start_states})
        tf_logits_1 = tf_output_1['logits']
        tf_present = tf_output_1['present']

        tf_output_2 = sess.run(tf_lm_output, feed_dict={tf_context: input_ids, tf_past: tf_present})
        tf_logits_2 = tf_output_2['logits']

    for j in range(batch_size):
        assert_allclose(
            gl_logits_1[j, :, :].asnumpy(),
            tf_logits_1[j, :, :],
            1E-3,
            1E-3
        )
    for j in range(batch_size):
        assert_allclose(
            gl_logits_2[j, :, :].asnumpy(),
            tf_logits_2[j, :, :],
            1E-3,
            1E-3
        )
Example #22

CONFIG_MAP = {}

# Melody
CONFIG_MAP['cat-mel_2bar_small'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.CategoricalLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=32,  # 2 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            free_bits=0,
            max_beta=0.2,
            beta_rate=0.99999,
            sampling_schedule='inverse_sigmoid',
            sampling_rate=1000,
        )),
    note_sequence_augmenter=data.NoteSequenceAugmenter(transpose_range=(-5,
                                                                        5)),
    data_converter=data.OneHotMelodyConverter(
        valid_programs=data.MEL_PROGRAMS,
        skip_polyphony=False,
        max_bars=100,  # Truncate long melodies before slicing.
        slice_bars=2,
        steps_per_quarter=4),
    train_examples_path=None,
    eval_examples_path=None,
        def _fitness(learning_rate):
            """
            Hyper-parameters:
            learning_rate:     Learning-rate for the optimizer.
            hidden_dim:  Size of Hidden Dimension
            """

            # Print the hyper-parameters.
            print('learning rate: {0:.1e}'.format(learning_rate))
            print()

            # Dir-name for the TensorBoard log-files.
            log_dir = _log_dir_name(learning_rate, self.model)

            # Create a callback-function for Keras which will be
            # run after each epoch has ended during training.
            # This saves the log-files for TensorBoard.
            # Note that there are complications when histogram_freq=1.
            # It might give strange errors and it also does not properly
            # support Keras data-generators for the validation-set.
            callback_log = TensorBoard(log_dir=log_dir,
                                       histogram_freq=0,
                                       batch_size=32,
                                       write_graph=True,
                                       write_grads=False,
                                       write_images=False)

            model = None
            history = None
            validation_data = None
            # Create the neural network with these hyper-parameters.
            #K.clear_session()
            if self.model == 'toy':

                X = np.random.randint(0, 6, size=(3000, 50))
                Y = np.random.randint(0, 6, size=(3000, 50, 1))

                model = Sequential()
                model.add(Embedding(6, 50, input_length=50))
                model.add(Dense(300, activation='relu'))
                model.add(Dense(6, activation='softmax'))
                model.compile(optimizer='adam',
                              loss='sparse_categorical_crossentropy',
                              metrics=['accuracy'])
                history = model.fit(X,
                                    Y,
                                    epochs=1,
                                    batch_size=1024,
                                    validation_split=0.2,
                                    validation_data=validation_data,
                                    verbose=1,
                                    callbacks=[callback_log] +
                                    self.custom_metrics)
            else:
                if self.model[:4] == "cap2" or self.model[:4] == "vae2":
                    inputs, outputs = None, None
                    datagen, valgen = None, None
                    cap2 = None
                    callbacks = [callback_log]

                    hparams = HParams(
                        learning_rate=learning_rate,
                        hidden_dim=1024,
                        optimizer='adam',
                        dropout=0.5,
                        max_seq_length=self.data_helper.max_caption_len,
                        embed_dim=self.embedding_matrix.shape[-1],
                        num_embeddings=self.embedding_matrix.shape[0],
                        activation='relu',
                        latent_dim=1000)

                    if self.gen == 'train' or self.gen == 'all':
                        data = get_data(self.model, self.data_helper, gen=True)
                        if self.gen == 'all':
                            val_data = get_data(self.model,
                                                self.val_helper,
                                                gen=True)
                        else:
                            val_data = get_data(self.model, self.val_helper)
                    else:
                        data = get_data(self.model, self.data_helper)
                        val_data = get_data(self.model, self.val_helper)

                        # _, X, Y1, Y2 = self.data_helper.cap2cap()
                        # if self.max_samples is not None:
                        #     X, Y1, Y2, = X[:self.max_samples], Y1[:self.max_samples], Y2[:self.max_samples]
                        # Y2 = np.expand_dims(Y2, axis=2)
                        # validation_data=None
                        # inputs = {'encoder_input': X, 'decoder_input': Y1}
                        # outputs = {'decoder_output': Y2}

                    if self.model != 'cap2img':
                        self.custom_metrics[0].validation_data = val_data
                        callbacks += self.custom_metrics
                        # _, X, Y = self.data_helper.cap2resnet()
                        # Y = Y[:,0,:]
                        # inputs = {'encoder_input': X}
                        # outputs = {'projection_output': Y}

                        # _, X, Y1, Y2, Y3 = self.data_helper.cap2all()
                        # #X, Y1, Y2, Y3 = X[:20], Y1[:20], Y2[:20], Y3[:20]
                        # Y2 = np.expand_dims(Y2, axis=2)
                        # Y3 = Y3[:,0,:]
                        # if self.max_samples is not None:
                        #     X, Y1, Y2, Y3 = X[:self.max_samples], Y1[:self.max_samples], Y2[:self.max_samples], Y3[:self.max_samples]
                        # inputs = {'encoder_input': X, 'decoder_input': Y1}
                        # outputs = {'projection_output': Y3, 'decoder_output': Y2}

                    ModelClass = get_model(self.model)
                    model = ModelClass(hparams,
                                       embeddings=self.embedding_matrix)

                    if self.path_load_model is not None:
                        print("Loading model " + self.path_load_model + " ...")
                        model.load_model(self.path_load_model)

                    model.compile(num_gpu=self.gpu)
                    # history = model.fit(inputs,
                    #                 outputs,
                    #                 epochs=3,
                    #                 batch_size=256,
                    #                 validation_split=0.2,
                    #                 validation_data=validation_data,
                    #                 callbacks=callbacks)
                    if model.gpu_model is None:
                        model_to_run = model.model
                    else:
                        model_to_run = model.gpu_model
                    if isinstance(data, keras.utils.Sequence):
                        history = model_to_run.fit_generator(
                            data,
                            epochs=self.epochs,
                            validation_data=val_data,
                            validation_steps=len(val_data),
                            callbacks=callbacks,
                            workers=4,
                            use_multiprocessing=True)
                    elif isinstance(data, tuple):
                        history = model_to_run.fit(x=data[0],
                                                   y=data[1],
                                                   epochs=self.epochs,
                                                   validation_data=val_data,
                                                   callbacks=callbacks,
                                                   batch_size=self.batch_size)

            # Get the classification accuracy on the validation-set
            # after the last training-epoch.
            if self.model != 'cap2img':
                f1 = self.custom_metrics[0].val_f1s[-1]
                print()
                print("Val F1: {0:.2%}".format(f1))
                print()
            else:
                f1 = history.history['val_acc'][-1]
                print()
                print("Val Acc: {0:.2%}".format(f1))
                print()

            # Print the classification accuracy.

            # Save the model if it improves on the best-found performance.
            # We use the global keyword so we update the variable outside
            # of this function.
            # If the classification accuracy of the saved model is improved ...
            print(self.best_f1)
            if f1 > self.best_f1:
                print("saving model at {0}".format(self.path_best_model))
                # Save the new model to harddisk.
                model.model.save(self.path_best_model)
                # Update the classification accuracy.
                self.best_f1 = f1

            # Delete the Keras model with these hyper-parameters from memory.
            del model

            # Clear the Keras session, otherwise it will keep adding new
            # models to the same TensorFlow graph each time we create
            # a model with a different set of hyper-parameters.
            K.clear_session()

            # NOTE: Scikit-optimize does minimization so it tries to
            # find a set of hyper-parameters with the LOWEST fitness-value.
            # Because we are interested in the HIGHEST classification
            # accuracy, we need to negate this number so it can be minimized.
            return -f1
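The closing comment explains why `_fitness` returns `-f1`: scikit-optimize minimizes its objective, so the F1 score is negated. A minimal, self-contained sketch of how such a fitness function is typically driven; the toy objective, search bounds, and call count below are assumptions, not taken from the snippet:

import numpy as np
from skopt import gp_minimize
from skopt.space import Real

def fitness(params):
    # Toy objective standing in for _fitness: F1 peaks near learning_rate = 1e-3.
    learning_rate = params[0]
    f1 = np.exp(-(np.log10(learning_rate) + 3.0) ** 2)
    return -f1  # negate so that maximizing F1 becomes a minimization problem

search_space = [Real(1e-5, 1e-1, prior='log-uniform', name='learning_rate')]

result = gp_minimize(fitness, search_space, n_calls=15, random_state=0)
print('best learning rate:', result.x[0], 'best F1:', -result.fun)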
Exemple #24
0
    def hparams(self, values):
        if not isinstance(self._hparams, HParams):
            self._hparams = HParams()
        for k, v in values.items():
            self._hparams.add_hparam(k, v)
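Assuming `HParams` here is `tf.contrib.training.HParams` (as in the other examples), `add_hparam` registers a new named value that then becomes attribute-accessible. A minimal sketch of the setter pattern above in isolation:

from tensorflow.contrib.training import HParams

hp = HParams()
for k, v in {'learning_rate': 1e-3, 'batch_size': 32}.items():
    hp.add_hparam(k, v)          # registers a new hyperparameter by name

print(hp.learning_rate)          # 0.001 -- values are exposed as attributes
print(hp.values())               # plain dict of all registered hparams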
Exemple #25
0
                        help='Number of epochs to start training from')
    parser.add_argument('--epochOffset',
                        dest='epoch_offset',
                        type=int,
                        default=0,
                        help='Offset of epochs to start training from')

    args = parser.parse_args()
    print(args)
    if len(vars(args)) == 0:
        raise Exception('Invalid arguments')

    hparams = HParams(input_size=129,
                      input_len=args.input_len,
                      rnn_layer_size=args.rnn_layer_size,
                      lr=args.lr,
                      num_epochs=args.num_epochs,
                      epoch_offset=args.epoch_offset,
                      epsilon=1.0 / 4,
                      num_note_lr=1)

    if args.mode == 'io':
        # assumes inpath - to a specific file, outpath - folder
        testAllIO(args.inpath, args.outpath)
    elif args.mode == 'mkv':
        # assumes inpath - to folder with files, outpath - outputfile
        testMonteCarlo(args.inpath, args.outpath, order=args.order)
    elif args.mode == 'fmidi':
        collectMIDIFiles(args.inpath, args.outpath, args.suffix)
    elif args.mode == 'trnn':
        testRNNTrain(args.inpath, args.outpath, hparams)
    elif args.mode == 'grnn':
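The snippet above wires argparse flags into an `HParams` object field by field; when every flag should become a hyperparameter, `vars(args)` can be unpacked directly instead. A small sketch of that variant (the flag names are illustrative only):

import argparse
from tensorflow.contrib.training import HParams

parser = argparse.ArgumentParser()
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--num_epochs', type=int, default=10)
args = parser.parse_args([])            # empty list: use the defaults

# Every parsed flag becomes an hparam of the same name.
hparams = HParams(**vars(args))
print(hparams.lr, hparams.num_epochs)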
Exemple #26
0
class TunatorLSTM:
    """
    LSTM model for synthesizing polyphonic MIDI.

    When the class is instantiated,
    the `update_datastore` method ensures that all the MIDI data from the
    files in `midi_dir` are stored in the HDF5 at `hdf5_path`. It queries the
    datastore for the song names in `midi_dir` using the `query_datastore`
    method, then updates it with any missing songs. The datastore schema is
    provided in the `update_datastore` method docstring.

    The model is instantiated either via `build_model` or `load_model` and
    trained via `train`. The model can be sampled via `compose`, which will
    output a MIDI file.

    Args:
        midi_dir (str): directory to MIDI files
        hdf5_path (str): path for HDF5 datastore
        hparams (dict): any hyperparameters to be changed from defaults.
            Defaults are shown under `hparams` property. They can also be
            changed dynamically by passing a dict to the `hparams` setter.
    """
    def __init__(self,
                 midi_dir='music/midi/final_fantasy/',
                 hdf5_path='data/songs.hdf5',
                 hparams=None):
        self.midi_dir = midi_dir
        self.hdf5_path = hdf5_path
        self._hparams = hparams

        # set up piano roll
        octaves = 10
        scale = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
        sharps = ['A#', 'C#', 'D#', 'F#', 'G#']
        flats = ['B-', 'D-', 'E-', 'G-', 'A-']
        sharps_scale = sorted(scale + sharps)
        sharps_oct = [note + str(i) for i in range(octaves) for note in sharps]
        flats_oct = [note + str(i) for i in range(octaves) for note in flats]
        flat_sharp_dict = dict(zip(flats_oct, sharps_oct))
        self.piano_roll = [
            note + str(i) for i in range(octaves) for note in sharps_scale
        ]
        self.piano_roll_dict = {
            note: i
            for i, note in enumerate(self.piano_roll)
        }
        self.piano_roll_dict.update({
            flat: self.piano_roll_dict[sharp]
            for flat, sharp in flat_sharp_dict.items()
        })

        # prepare data
        self.song_file_dict = self.get_song_file_dict()
        songs = list(self.song_file_dict)
        random.shuffle(songs)
        split = int(.8 * len(songs))
        self.train_songs = songs[:split]
        self.val_songs = songs[split:]
        self.update_datastore()

        # instantiate tensor generators for lazy evaluation during training
        self.train_tensor_gen = NoteChordOneHotTensorGen(
            self.train_songs, self.hparams.batch_size, self.hparams.timesteps,
            hdf5_path, self.piano_roll_dict, self.n_vocab)
        self.val_tensor_gen = NoteChordOneHotTensorGen(
            self.val_songs, self.hparams.batch_size, self.hparams.timesteps,
            hdf5_path, self.piano_roll_dict, self.n_vocab)

    @property
    def hparams(self):
        defaults = {
            'learning_rate': 0.001,
            'dropout': 0.0,
            'lstm_units': 512,
            'dense_units': 512,
            'batch_size': 32,
            'timesteps': 256,
            'epochs': 3,
        }

        if isinstance(self._hparams, HParams):
            return self._hparams
        elif self._hparams:
            user_entered = self._hparams
        else:
            user_entered = dict()
        combined = dict()
        combined.update(defaults)
        combined.update(user_entered)
        self._hparams = HParams()
        for k, v in combined.items():
            self._hparams.add_hparam(k, v)

        return self._hparams

    @hparams.setter
    def hparams(self, values):
        if not isinstance(self._hparams, HParams):
            self._hparams = HParams()
        for k, v in values.items():
            self._hparams.add_hparam(k, v)

    @property
    def n_vocab(self):
        return len(self.piano_roll)

    @property
    def timestamp(self):
        return datetime.now()

    def build_model(self):
        """
        Builds the stacked CuDNN-LSTM network with time-distributed dense
        output layers and compiles it with binary cross-entropy loss.

        Returns:
            None. The compiled model is stored on `self.model`.
        """
        self.model = Sequential()
        input_shape = (None, self.n_vocab)

        self.model.add(
            CuDNNLSTM(
                self.hparams.lstm_units,
                input_shape=input_shape,
                return_sequences=True,
            ))
        self.model.add(Dropout(self.hparams.dropout))

        self.model.add(
            CuDNNLSTM(self.hparams.lstm_units, return_sequences=True))
        self.model.add(Dropout(self.hparams.dropout))

        self.model.add(
            CuDNNLSTM(self.hparams.lstm_units, return_sequences=True))
        self.model.add(Dropout(self.hparams.dropout))

        self.model.add(TimeDistributed(Dense(self.n_vocab)))
        self.model.add(Dropout(self.hparams.dropout))

        self.model.add(TimeDistributed(Dense(self.n_vocab)))
        self.model.add(Dropout(self.hparams.dropout))

        self.model.add(TimeDistributed(Activation('sigmoid')))
        self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    def load_model(self, model_path):
        """
        Rebuilds the architecture and loads previously saved weights into it.

        Args:
            model_path (str): path to an HDF5 weights file saved by Keras.

        Returns:
            None. The weights are loaded into `self.model`.
        """
        self.build_model()
        self.model.load_weights(model_path)

    def train(self):
        """
        Trains `self.model` on batches from the training tensor generator,
        validating on a fixed slice of the validation generator, with
        TensorBoard logging and loss-based checkpointing.

        Returns:
            None.
        """
        timestamp = datetime.now()
        log_name = f'note-chord-one-hot-songs_{timestamp}'
        tensorboard = TensorBoard(log_dir=f'logs/{log_name}',
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_grads=True,
                                  batch_size=4)  #write_images
        # if adding embeddings, add those parameters
        checkpoint_name = 'weights-improvement-epoch_{epoch:02d}-loss_{loss:.4f}.hdf5'
        checkpoint = ModelCheckpoint(f'checkpoints/{checkpoint_name}',
                                     monitor='loss',
                                     verbose=0,
                                     save_best_only=True,
                                     mode='min')

        val_slice = list(islice(self.val_tensor_gen, 10))
        X_val_list = list()
        Y_val_list = list()
        for item in val_slice:
            X_val_list.append(item[0])
            Y_val_list.append(item[1])
        X_val = np.concatenate(X_val_list, axis=0)
        del X_val_list
        Y_val = np.concatenate(Y_val_list, axis=0)
        del Y_val_list
        val_data = (X_val, Y_val)
        self.model.fit_generator(
            self.train_tensor_gen,
            validation_data=val_data,
            # validation_steps=10,
            steps_per_epoch=self.train_tensor_gen.n_batches,
            epochs=self.hparams.epochs,
            callbacks=[checkpoint, tensorboard])

    def get_song_file_dict(self):
        """
        Creates a lookup dictionary to get filepath from song name.
        Returns:
            song_file_dict (dict): Dictionary with song names as keys and
                filepaths relative to the current directory as values, including
                file extension.
        """
        file_exts = ('*.mid', '*.midi', '*.MID', '*.MIDI')
        song_files = list()
        for ext in file_exts:
            song_files += glob.glob(os.path.join(self.midi_dir, ext))
        get_song_name = lambda x: os.path.splitext(os.path.basename(x))[0]
        song_names = [get_song_name(file) for file in song_files]
        # dict to look up filepath by song name
        song_file_dict = dict(zip(song_names, song_files))
        return song_file_dict

    def query_datastore(self, query, grp_path='songs'):
        """
        Checks datastore at `hdf5_path` for keys specified by `query` within the
        group specified by `grp_path` non-recursively. If the datastore does not
        exist, the query will not raise an error, but instead return all of the
        queried items in `not_found` and none of them in `found`.
        Args:
            query (iterable): keys for which to search within the grp_path in
                the datastore
            grp_path (str): path to the group in which to search for they keys

        Returns:
            found (set): query items that were found in the group
            not_found (set): query items that were not found in the group
        """
        # TODO: what about querying the base group?
        if os.path.isfile(self.hdf5_path):
            grp = h5py.File(self.hdf5_path, 'r')[grp_path]
            keys = list(grp.keys())
        else:
            keys = list()

        found = set(query).intersection(keys)
        not_found = set(query) - set(found)
        return found, not_found

    def update_datastore(self):
        """
        Updates HDF5 datastore with note sequences for any songs in midi_dir
        that are not already present in the datastore.
        """
        def _parse_midi(song):
            """
            The songs are transposed to the key of A. The parser extracts
            the lowest-numbered part that has more than 50 notes. If that
            doesn't work, it flattens all of the parts into a single "flat" part
            which contains all the instruments.
            Args:
                song (str): song name used to look up its file in
                    `self.song_file_dict`.

            Returns:
                notes_to_parse: an iterable of music21 note and chord objects.
            """
            # TODO: convert output to namedtuples with metadata
            file = self.song_file_dict[song]
            print(f'updating datastore: {file}...')
            midi = m21.converter.parse(file)

            # transpose to A
            transpose_dict = {
                'A#': 11,
                'B-': 11,
                'B': 10,
                'C': 9,
                'C#': 8,
                'D-': 8,
                'D': 7,
                'D#': 6,
                'E-': 6,
                'E': 5,
                'F': 4,
                'F#': 3,
                'G-': 3,
                'G': 2,
                'G#': 1,
                'A-': 1,
                'A': 0,
            }
            key = midi.analyze('key').getTonic().name
            midi = midi.transpose(transpose_dict[key])
            # extract piano, or other
            try:
                midi_parts = m21.instrument.partitionByInstrument(midi).parts
                part = midi_parts[0]
                if not part.partName == 'Piano':
                    pass
                notes_to_parse = part.recurse().notes
                part_i = 0
                while len(notes_to_parse) < 50:
                    part_i += 1
                    part = midi_parts[part_i]
                    notes_to_parse = part.recurse().notes

            except Exception:  # file has notes in a flat structure
                notes_to_parse = midi.flat.chordify().notes

            return notes_to_parse

        def _parse_notes(notes_to_parse):
            """
            Parse MIDI data to a dictionary of timesteps and corresponding
            notes.
            """
            notes = dict()
            for elem in notes_to_parse:
                time = elem.offset

                # TODO: remove after time fix
                if time % 0.5 != 0:
                    continue

                if time not in notes:
                    notes[time] = set()

                if isinstance(elem, m21.note.Note):
                    note_int = self.piano_roll_dict[str(elem.pitch)]
                    notes[time].add(note_int)
                elif isinstance(elem, m21.chord.Chord):
                    note_ints = [
                        self.piano_roll_dict[str(pitch)]
                        for pitch in elem.pitches
                    ]
                    notes[time].update(note_ints)
                else:
                    raise ValueError()

            # TODO: SongMap slicable hashmap class
            # correct fractional indices
            frac_notes = {
                k: v
                for k, v in notes.items() if isinstance(k, fractions.Fraction)
            }
            for k, v in frac_notes.items():
                del notes[k]
                nearest_quarter = round(k * 4) / 4
                if nearest_quarter in notes:
                    notes[nearest_quarter].update(v)
                else:
                    notes[nearest_quarter] = v

            # fill missing time indices
            # temporarily remove because only rests were generated

            time_list = sorted(notes)
            if not time_list:
                raise ValueError()
            end_time = max(time_list)
            min_space = min(
                [j - i for i, j in zip(time_list[:-1], time_list[1:])])
            """
            expected_times = np.array(range(int(end_time / min_space))) * min_space
            missing_times = set(expected_times) - set(time_list)
            if missing_times:
                print(f'filling in {len(missing_times)} missing timepoints in '
                      f'existing {len(notes)}...')
                notes.update({time: set() for time in missing_times})
            """
            # convert to half notes

            # convert notes to a list of strings
            #str_notes = ['.'.join(sorted(notes[k])) for k in sorted(notes)]
            # remove leading and trailing rests
            #for i in (0, -1):
            #    while str_notes and str_notes[i] == '':
            #        str_notes.pop(i)
            # encoding required by h5py
            #str_notes = np.array(str_notes).astype('|S9')

            #vocab = np.array(list(set(str_notes))).astype('|S9')
            return notes, min_space

        def _write_to_datastore(notes, min_space):
            """
            Write a sequence of piano-roll integers to the HDF5 datastore.
            Args:
                notes (dict): mapping of time offsets to sets of piano-roll
                    integers, as produced by `_parse_notes`.
                min_space (float): minimum spacing between consecutive time
                    offsets, stored as the dataset's `spacing` attribute.

            Returns:
                None.
            """
            notes_list = np.array(
                [np.array(list(notes[k])).astype('i8') for k in sorted(notes)])
            with h5py.File(self.hdf5_path, 'a') as f:
                grp = f.create_group(f'songs/{song}')
                dt = h5py.special_dtype(vlen=np.dtype('int8'))
                dset_notes = grp.create_dataset(name='notes',
                                                shape=(len(notes_list), 1),
                                                data=notes_list,
                                                dtype=dt)
                dset_notes.attrs['spacing'] = min_space

        song_names = set(self.song_file_dict)
        _, missing_songs = self.query_datastore(song_names)

        for song in missing_songs:
            notes_to_parse = _parse_midi(song)
            notes, min_space = _parse_notes(notes_to_parse)
            _write_to_datastore(notes, min_space)

    def compose(self, timesteps):
        """
        Generate a MIDI file of length `timesteps`, starting from a random seed
        note drawn from a song in the datastore.
        Args:
            timesteps (int): number of timesteps to synthesize

        Returns:
            Y_hat_strs (list): list of note-name strings per timestep; the
                same sequence is also written out as a MIDI file.
        """
        seed_note = np.array([])
        while seed_note.size == 0:
            with h5py.File(self.hdf5_path) as f:
                grp = f['songs']
                song_names = list(grp.keys())
                song_idx = np.random.randint(0, len(song_names))
                song = grp[song_names[song_idx]]['notes']
                note_idx = np.random.randint(0, len(song))
                seed_note = song[note_idx][0]

        x = np.zeros(self.n_vocab)
        x[seed_note] = 1

        # generate notes
        Y_hat_inds_seq = []
        for i in range(timesteps):
            x = np.expand_dims(x, axis=0)
            x = np.expand_dims(x, axis=0)
            y_hat = self.model.predict(x)
            y_hat_inds = np.argwhere(y_hat > .5).flatten()
            if y_hat_inds.size == 0:
                y_hat_inds = np.argmax(y_hat).flatten()
            Y_hat_inds_seq.append(y_hat_inds)
            x = np.zeros(self.n_vocab)
            x[y_hat_inds] = 1

        # ipdb.set_trace()  # debug breakpoint disabled so compose() runs uninterrupted
        rev_piano_roll_dict = {v: k for k, v in self.piano_roll_dict.items()}
        Y_hat_strs = [[rev_piano_roll_dict[ind] for ind in Y_hat_inds]
                      for Y_hat_inds in Y_hat_inds_seq]

        self._output_midi(Y_hat_strs)

        return Y_hat_strs

    def _output_midi(self, Y_hat_strs):
        timesteps = len(Y_hat_strs)
        offset = 0
        output_notes = []

        for event_strs in Y_hat_strs:  # chord
            if len(event_strs) > 1:
                notes = []
                for note_str in event_strs:
                    note = m21.note.Note(note_str)  # note_str is a pitch name such as 'A#4'
                    note.storedInstrument = m21.instrument.Piano()
                    notes.append(note)
                chord = m21.chord.Chord(notes)
                chord.offset = offset
                output_notes.append(chord)
            elif event_strs:  # note
                note = m21.note.Note(event_strs[0])
                note.offset = offset
                note.storedInstrument = m21.instrument.Piano()
                output_notes.append(note)
            else:  # rest
                rest = m21.note.Rest()
                rest.offset = offset
                rest.storedInstrument = m21.instrument.Piano()
                output_notes.append(rest)

            offset += 1

        midi = m21.stream.Stream(output_notes)
        midi.write('midi', fp=f'test_output-{timesteps}-{self.timestamp}.mid')
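Going by the class docstring above, typical use of `TunatorLSTM` is to instantiate it (which populates the HDF5 datastore), build or load a model, train, and then sample. A hedged usage sketch; the paths and hyperparameter overrides below are illustrative only:

# Illustrative paths; any directory of MIDI files and a writable HDF5 path work.
tunator = TunatorLSTM(midi_dir='music/midi/final_fantasy/',
                      hdf5_path='data/songs.hdf5',
                      hparams={'batch_size': 16, 'epochs': 5})

tunator.build_model()          # or: tunator.load_model('checkpoints/weights.hdf5')
tunator.train()                # fits on the generator, checkpoints best loss
note_strs = tunator.compose(timesteps=256)   # writes a MIDI file and returns note strings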
Exemple #27
0
hparams = HParams(
    expr='atomic',

    train_batch_size=96,
    eval_batch_size=256,
    max_seq_length=48,  # 128 * 48 for 12GB GPU, 96 * 48 for 8GB; 32 for edge completion
    learning_rate=5e-5,
    lr_decay_step=10000,
    max_lr_decay_rate=0.1,
    adam_epsilon=1e-8,
    gradient_accumulation_steps=1,
    warmup_steps=1000,
    weight_decay=0.0,
    max_grad_norm=1.0,
    dropout_rate=0.3,

    use_type_classifier=False,
    n_edge_types=3,
    load_pretrained=True,
    use_roberta=False,

    sampling_rate=0.5,
    conceptualize_rate=0.5,  # among 1-sampling_rate
    entity_sub_mode='conceptualize',  # conceptualize / random_entity
    concept_score='frequency',  # likelihood / pmi / frequency
    score_weighted=True,
    random_select_mode='instance',  # entity / concept / instance
    n_candidates=40,  # small for conceptualize, large for random_entity
    text_sample_eval=True,
    text_sample_train=False,
    ht_symmetry=False,
    max_entity_word_inc=2,
)
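As with the other configurations in this collection, individual values can be overridden at run time without editing the file, for example via `HParams.parse`, which accepts a comma-separated `name=value` string (assuming the tf.contrib.training implementation):

# Override a couple of the values defined above from a command-line style string.
hparams.parse('train_batch_size=32,dropout_rate=0.1')
print(hparams.train_batch_size, hparams.dropout_rate)  # 32 0.1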
Exemple #28
0

TRAIN_RATIO = 0.7
VALID_RATIO = 0.15
TEST_RATIO = 0.15
IMAGE_SIZE = 128

CATEGORIES_NAME = [
    "Charlock", "Maize", "Black-grass", "Fat Hen", "Loose Silky-bent",
    "Cleavers", "Common Chickweed", "Small-flowered Cranesbill", "Sugar beet",
    "Scentless Mayweed", "Common wheat", "Shepherds Purse"
]

CATEGORIES_NUM = len(CATEGORIES_NAME)

path_params = HParams(data_dir=DATA_DIR,
                      tfrecord_path=TFRECORD_PATH,
                      train_data_path=TRAIN_DATA_PATH,
                      valid_data_path=VALID_DATA_PATH,
                      test_data_path=TEST_DATA_PATH)

train_params = HParams(batch_size=BATCH_SIZE,
                       train_ratio=TRAIN_RATIO,
                       valid_ratio=VALID_RATIO,
                       test_ratio=TEST_RATIO,
                       image_size=IMAGE_SIZE,
                       categories_name=CATEGORIES_NAME,
                       categories_num=CATEGORIES_NUM,
                       seed=8888,
                       epochs=1)

model_params = HParams(image_size=train_params.image_size,
                       learning_rate=0.0001)
Exemple #29
0

    def get_default_hparams(self):
        return HParams(**self.get_default_hparams_dict())
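The helper above assumes a companion `get_default_hparams_dict` that returns a plain dict; round-tripping between the two representations is straightforward since `HParams.values()` returns a dict again. A minimal sketch with a hypothetical defaults dict:

from tensorflow.contrib.training import HParams

def get_default_hparams_dict():
    # Hypothetical defaults; the real dict lives in the class this method was taken from.
    return {'learning_rate': 1e-3, 'hidden_dim': 256}

hp = HParams(**get_default_hparams_dict())
assert hp.values() == get_default_hparams_dict()   # dict -> HParams -> dict round trip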
Exemple #30
0
hparams = HParams(
    cleaners="english_cleaners",

    tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
    tacotron_num_gpus=1,  # Determines the number of gpus in use for Tacotron training.
    split_on_cpu=True,

    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality

    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value

    clip_mels_length=True,

    max_mel_frames=900,

    use_lws=False,

    silence_threshold=2,  # silence threshold used for sound trimming for wavenet preprocessing
    
    # Mel spectrogram  
    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    
    frame_shift_ms=None,  # Can replace hop_size parameter. (Recommended: 12.5)
    

    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=23,
    

    signal_normalization=True,

    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,

    max_abs_value=4.,

    normalize_for_wavenet=True,

    clip_for_wavenet=True,

    preemphasize=True,  # whether to apply filter
    preemphasis=0.97,  # filter coefficient.
    
    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,

    fmax=7600,  # To be increased/reduced depending on data.
    
    # Griffin Lim
    power=1.5,

    griffin_lim_iters=60,

    

    outputs_per_step=2, # Was 1

    stop_at_any=True,

    
    embedding_dim=512,  # dimension of embedding space (these are NOT the speaker embeddings)
    
    # Encoder parameters
    enc_conv_num_layers=3,  # number of encoder convolutional layers
    enc_conv_kernel_size=(5,),  # size of encoder convolution filters for each layer
    enc_conv_channels=512,  # number of encoder convolutions filters for each layer
    encoder_lstm_units=256,  # number of lstm units for each direction (forward and backward)
    

    smoothing=False,  # Whether to smooth the attention normalization function
    attention_dim=128,  # dimension of attention space
    attention_filters=32,  # number of attention convolution filters
    attention_kernel=(31,),  # kernel size of attention convolution
    cumulative_weights=True,

    
    # Decoder
    prenet_layers=[256, 256],  # number of layers and number of units of prenet
    decoder_layers=2,  # number of decoder lstm layers
    decoder_lstm_units=1024,  # number of decoder lstm units on each layer
    max_iters=2000,
    # Max decoder steps during inference (Just for safety from infinite loop cases)
    
    # Residual postnet
    postnet_num_layers=5,  # number of postnet convolutional layers
    postnet_kernel_size=(5,),  # size of postnet convolution filters for each layer
    postnet_channels=512,  # number of postnet convolution filters for each layer
    
    # CBHG mel->linear postnet
    cbhg_kernels=8,
    # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
    #  as "K-grams"
    cbhg_conv_channels=128,  # Channels of the convolution bank
    cbhg_pool_size=2,  # pooling size of the CBHG
    cbhg_projection=256,
    # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
    cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
    cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
    cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
    cbhg_rnn_units=128,
    # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in 
    # shape
    
    # Loss params
    mask_encoder=True,
    # whether to mask encoder padding while computing attention. Set to True for better prosody 
    # but slower convergence.
    mask_decoder=False,
    # Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
    #  be weighted, else recommended pos_weight = 20)
    cross_entropy_pos_weight=20,
    # Use class weights to reduce the stop token classes imbalance (by adding more penalty on 
    # False Negatives (FN)) (1 = disabled)
    predict_linear=False,
    # Whether to add a post-processing network to the Tacotron to predict linear spectrograms
    # (True mode not tested!)
    ###########################################################################################################################################

    # Tacotron Training
    # Reproduction seeds
    tacotron_random_seed=5339,
    # Determines initial graph and operations (i.e: model) random state for reproducibility
    tacotron_data_random_state=1234,  # random state for train test split repeatability
    
    # performance parameters
    tacotron_swap_with_cpu=False,
    # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause 
    # major slowdowns! Only use when critical!)
    
    # train/test split ratios, mini-batches sizes
    tacotron_batch_size=36,  # number of training samples on each training steps (was 32)

    tacotron_synthesis_batch_size=128,
    # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
    tacotron_test_size=0.05,

    tacotron_test_batches=None,  # number of test batches.
    

    tacotron_decay_learning_rate=True,
    # boolean, determines if the learning rate will follow an exponential decay
    tacotron_start_decay=50000,  # Step at which learning decay starts
    tacotron_decay_steps=50000,  # Determines the learning rate decay slope (UNDER TEST)
    tacotron_decay_rate=0.5,  # learning rate decay rate (UNDER TEST)
    tacotron_initial_learning_rate=1e-3,  # starting learning rate
    tacotron_final_learning_rate=1e-5,  # minimal learning rate
    
    # Optimization parameters
    tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
    tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
    tacotron_adam_epsilon=1e-6,  # AdamOptimizer Epsilon parameter
    
    # Regularization parameters
    tacotron_reg_weight=1e-7,  # regularization weight (for L2 regularization)
    tacotron_scale_regularization=False,
    # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
    #  high and biasing the model)
    tacotron_zoneout_rate=0.1,  # zoneout rate for all LSTM cells in the network
    tacotron_dropout_rate=0.5,  # dropout rate for all convolutional layers + prenet
    tacotron_clip_gradients=True,  # whether to clip gradients
    
    # Evaluation parameters
    natural_eval=False,

    tacotron_teacher_forcing_mode="constant",

    tacotron_teacher_forcing_ratio=1.,

    tacotron_teacher_forcing_init_ratio=1.,
    # initial teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_final_ratio=0.,
    # final teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_start_decay=10000,
    # starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_steps=280000,
    # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_alpha=0.,

 
    # Tacotron-2 integration parameters
    train_with_GTA=False,

    sentences=[
        # From July 8, 2017 New York Times:
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There\"s a way to measure the acute emotional intelligence that has never gone out of "
		"style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "The Senate\"s bill to repeal and replace the Affordable Care Act is now imperiled.",
        # From Google"s Tacotron example page:
        "Generative adversarial network or variational auto-encoder.",
        "Basilar membrane and otolaryngology are not auto-correlations.",
        "He has read the whole thing.",
        "He reads books.",
        "He thought it was time to present the present.",
        "Thisss isrealy awhsome.",
        "Punctuation sensitivity, is working.",
        "Punctuation sensitivity is working.",
        "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
        "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
        "Tajima Airport serves Toyooka.",
        # From The web (random long utterance)
        "Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
        This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
        the adopted architecture is able to perform this task with wild success.",
        "Thank you so much for your support!",
    ],
    
    
    ### SV2TTS ###
    speaker_embedding_size=256,
    silence_min_duration_split=0.4, # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration=1.6,     # Duration in seconds below which utterances are discarded
    
)
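A quick sanity check of how the audio settings above fit together: `hop_size` and `win_size` are tied to `sample_rate`, and `frame_shift_ms` (if set) would replace `hop_size`. The arithmetic below reproduces the values given in the inline comments:

sample_rate = 16000
frame_shift_ms = 12.5      # the recommended value from the comment above
frame_length_ms = 50.0     # 0.05 * sample_rate -> win_size

hop_size = int(sample_rate * frame_shift_ms / 1000)    # 200 samples = 12.5 ms
win_size = int(sample_rate * frame_length_ms / 1000)   # 800 samples = 50 ms
print(hop_size, win_size)                              # 200 800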