Esempio n. 1
0
    def __init__(self, args):
        """Build the training graph, session, checkpoint saver and log files.

        Everything is driven by ``args``; the attributes read directly here are
        ``wl``, ``sample_size``, ``silence_threshold``, ``gc_channels``,
        ``data_dir``, ``batch_size``, ``l2_regularization_strength``,
        ``max_checkpoints`` and ``load_type``.  ``args`` is also handed to
        ``calc_pad``, ``validate_directories`` and ``SampleTransformer``, which
        may read further attributes.

        Side effects visible in this method:
          * ``args.sample_size`` is reduced in place, and
            ``args.l2_regularization_strength`` is replaced by ``None`` when
            it equals 0.
          * A ``tf.Session`` is created and all global variables are
            initialised before a checkpoint restore is attempted via ``load``.
          * ``<logdir>/log.txt`` is opened and kept open on ``self.log_file``;
            every argument is written to ``<logdir>/config.txt``.

        If ``validate_directories`` raises ``ValueError`` the message is
        printed and the constructor returns early, leaving the instance only
        partially initialised (no ``logdir``, network, session, ...).
        """
        self.down_sampling_rates = [16, 16]
        self.kernel_size = 2
        self.args = args
        # Dilations 1, 2, ..., 2**wl, with the whole sequence repeated twice.
        self.dilation_rates = [2**i for i in range(args.wl + 1)] * 2
        self.receptive_field = self.calc_pad(self.args)
        try:
            directories = self.validate_directories(self.args)
        except ValueError as e:
            print("Some arguments are wrong:")
            print(str(e))
            # NOTE(review): returning from __init__ yields a half-built object;
            # callers must not use it after this message is printed.
            return

        self.logdir = directories['logdir']

        # Even if we restored the model, we will treat it as new training
        # if the trained model is written into an arbitrary location.

        ### Modify the sample size (original note: "to become square complete").
        # Operator precedence: this is sample_size - (receptive_field // 2).
        self.args.sample_size = self.args.sample_size - self.receptive_field // 2
        # Create network.
        self.net_train = SampleTransformer(self.down_sampling_rates,
                                           self.dilation_rates,
                                           self.kernel_size,
                                           self.receptive_field, self.args)
        # self.net_val = SampleTransformer(self.down_sampling_rates, self.dilation_rates, self.kernel_size, self.receptive_field, self.args)
        # Load raw waveform from VCTK corpus.

        with tf.name_scope('create_inputs'):
            # Allow silence trimming to be skipped by specifying a threshold near
            # zero.
            silence_threshold = self.args.silence_threshold if self.args.silence_threshold > \
                                                        EPSILON else None
            gc_enabled = self.args.gc_channels is not None
            # NOTE(review): sample_rate is hard-coded to 0 here -- confirm that
            # AudioReader interprets this as intended.
            self.reader = AudioReader(
                args.data_dir,
                sample_rate=0,
                batch_size=self.args.batch_size,
                gc_enabled=gc_enabled,
                receptive_field=self.
                receptive_field,  # TODO: change receptive field
                sample_size=self.args.sample_size,
                silence_threshold=silence_threshold)

            self.audio_batch, self.begin = self.reader.get_input_placeholder()

        self.trainData_iter = self.reader.get_data_iterator('train')
        self.valData_iter = self.reader.get_data_iterator('val')

        # A strength of exactly 0 is passed on as None (mutates the caller's args).
        if args.l2_regularization_strength == 0:
            args.l2_regularization_strength = None

        # Step and learning rate are fed at run time through placeholders.
        self.g_step = tf.placeholder(dtype=tf.int32, shape=None, name='step')
        self.lr = tf.placeholder(dtype=tf.float32,
                                 shape=None,
                                 name='learning_rate')
        # Fourth positional argument is True here and False for loss_val below.
        self.loss_train = self.net_train.loss(
            self.audio_batch,
            self.begin,
            self.g_step,
            True,
            l2_regularization_strength=args.l2_regularization_strength)

        bs.clear_bst_constants()
        # Snapshot of trainable variables taken after the training loss is built.
        params = tf.trainable_variables()
        grads = bs.gradients(self.loss_train, params)
        self.global_norm, self.norm_scale = bs.clip_by_global_norm(
            grads, grad_scale=1.0, clip_norm=1.0)
        adam = bs.AdamOptimizer(learning_rate=self.lr,
                                norm_scale=self.norm_scale,
                                grad_scale=1.0,
                                fp16=False)
        self.train_op = adam.apply_gradients(zip(grads, params))
        # The validation loss is built from the same network object and the
        # same input placeholders as the training loss.
        self.loss_val = self.net_train.loss(
            self.audio_batch,
            self.begin,
            self.g_step,
            False,
            l2_regularization_strength=args.l2_regularization_strength)
        # Restoring ...
        # reuse=True means this fetches an already-created variable, so the
        # scope name (spelled 'memroy') has to match the spelling used where
        # the variable is created -- do not correct it here in isolation.
        with tf.variable_scope('memroy', reuse=True):
            memory = tf.get_variable('mem')
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False))
        init = tf.global_variables_initializer()
        self.sess.run(init)

        # Checkpoints hold the trainable variables plus the 'mem' variable.
        var_list = tf.trainable_variables() + [memory]
        self.saver = tf.train.Saver(var_list=var_list,
                                    max_to_keep=args.max_checkpoints)
        try:
            self.saved_global_step, self.best_val_loss = load(
                self.saver, self.sess, self.logdir, self.args.load_type)
            if self.saved_global_step is None:
                # No checkpoint step was returned: start counting from 0 with
                # an infinite best validation loss.
                # NOTE(review): an earlier version of this comment said -1 is
                # stored here so the first step is saved_global_step + 1; the
                # code stores 0 -- confirm which start value is intended.
                self.saved_global_step = 0
                self.best_val_loss = np.inf
        except:
            # Report and re-raise whatever went wrong (the exception is not
            # swallowed).
            print(
                "Something went wrong while restoring checkpoint. "
                "We will terminate training to avoid accidentally overwriting "
                "the previous model.")
            raise
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(self.logdir, STARTED_DATESTRING))
        # Append when log.txt already exists, otherwise create it.  The same
        # mode is reused for config.txt even though only log.txt is checked.
        open_type = 'a' if os.path.exists(self.logdir + '/log.txt') else 'w'
        # Kept open on the instance; it is not closed in this method.
        self.log_file = open(self.logdir + '/log.txt', open_type)
        with open(self.logdir + '/config.txt', open_type) as f:
            f.write(STARTED_DATESTRING + '\n\n')
            for arg in vars(self.args):
                f.write('{}: {}\n'.format(arg, getattr(self.args, arg)))
Esempio n. 2
0
def model(xs, ys, loss_scale=None, train=False):
    """Build the transformer graph and, when training, its update ops.

    Relies on names defined outside this function: the hyperparameter object
    ``hps``, the MPI world size ``mpi_size`` and ``transformer_block``.

    Args:
        xs: Discrete input indexes, looked up in the ``embed/x`` table.
        ys: Target labels for the cross-entropy loss.
        loss_scale: Loss-scaling tensor.  Only used when ``train`` is true and
            ``hps.float16`` is set (its reciprocal becomes ``grad_scale``).
        train: When true, variables are created (``reuse=False``) and the
            gradient / optimizer ops are built; otherwise the existing
            variables are reused and only the loss is produced.

    Returns:
        If ``train`` is true, the tuple
        ``(loss, tf.group(train_op, update_op), global_norm, norm_scale)``.
        Otherwise just ``loss``.  With ``mpi_size > 1`` the loss is scaled by
        ``1 / mpi_size`` and all-reduced in both modes.
    """

    with tf.variable_scope("model", reuse=not train):

        with tf.device("/cpu:0"):
            if train:
                grad_scale = tf.reciprocal(loss_scale) if hps.float16 else 1.0
                global_step = tf.get_variable(
                    "global_step", [],
                    initializer=tf.ones_initializer(),
                    trainable=False)
                # Linear warmup over hps.warmup_iters steps, then constant hps.lr.
                learning_rate = tf.minimum(
                    global_step * (1.0 / hps.warmup_iters), 1.0) * hps.lr
            mpi_scale = tf.constant(1.0 / mpi_size)

        with tf.device("/gpu:0"):

            # Contains scope/var_name substrings we use to group gradients for all reduce
            # You'll want to find groupings that are scheduled uniquely by tensorflow, otherwise bs.allreduce could hang.
            # The groups should be ordered in which the all-reduce is called.
            # Any gradients not matching the substrings will get appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02))
                p_embed = tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01))

                if hps.float16:
                    x_embed = bs.float_cast(x_embed,
                                            dtype=tf.float16,
                                            dx_dtype=tf.float16)
                    p_embed = bs.float_cast(p_embed,
                                            dtype=tf.float16,
                                            dx_dtype=tf.float16)

                # bs.embedding_lookup can be much faster than tf version for low entropy indexes or small vocabs
                x = bs.embedding_lookup(x_embed, xs)

                if train and hps.embed_pdrop > 0.0:
                    # this part of the code is not recomputed so no need to remember the generated mask returned by bs.dropout
                    x, _ = bs.dropout(x, keep_prob=1.0 - hps.embed_pdrop)
                    p_embed, _ = bs.dropout(p_embed,
                                            keep_prob=1.0 - hps.embed_pdrop)

                h = x + p_embed
                # Groups are inserted at the front, so the list ends up in
                # reverse construction order (last layer first, 'embed' last).
                grad_groups.insert(0, 'embed')

            for layer_idx in range(hps.n_layer):
                layer_name = 'layer_%d' % layer_idx
                # enable the recompute decorator in training
                # see blocksparse/grads.py if you want understand how this works
                h = transformer_block(h,
                                      layer_name,
                                      train=train,
                                      recompute=train and hps.recompute)
                grad_groups.insert(0, layer_name)

            # Flatten the features to [-1, n_state] and project them onto the
            # input embedding matrix (x_embed is reused, transposed) to obtain
            # the logits.
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            if hps.float16:
                # much faster and more memory efficient (but currently only implemented in fp16)
                loss = bs.softmax_cross_entropy(logits=logits, labels=ys)
            else:
                labels = tf.cast(tf.reshape(ys, [-1]), tf.int32)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels)

            loss = tf.reduce_mean(loss)

            if train:

                # apply loss scaling in fp16 mode
                if hps.float16:
                    grad_loss = bs.scale_tensor(loss, loss_scale)
                else:
                    grad_loss = loss

                # use bs.gradients to allow bs.recomputable decorators to work
                params = tf.trainable_variables()
                grads = bs.gradients(grad_loss, params)

                if mpi_size > 1:
                    # apply (1.0 / mpi_size) scaling prior to all_reduce to allow greater utilization of fp16 dynamic range.
                    # That is we're ok with flushing some small values to zero to allow growth of large values in allreduce (without hitting inf).
                    loss = bs.scale_tensor(loss, mpi_scale)
                    grads = [bs.scale_tensor(g, mpi_scale) for g in grads]

                    # allreduce in an mpi context
                    # bias and gain grads will be in fp32, but have them fp16 cast prior to allreduce
                    # (Uses `hps` like every other hyperparameter access in
                    # this function; the previous `H.float16` referred to a
                    # name this function never defines.)
                    cast_all = tf.float16 if hps.float16 else None
                    loss = bs.allreduce(loss)
                    grads = bs.group_allreduce(grads,
                                               params,
                                               search_strings=grad_groups,
                                               cast_all=cast_all)

                # This does not actually perform the clipping, only measures the norm_scale needed to be applied.
                # norm_scale is then later applied in the fused optimizer ops (eliminating an extra pass over the gradients).
                # norm_scale is also used to detect inf/nan values in any of the gradients so the whole update can be skipped
                # and tried again with a new loss_scale.
                global_norm, norm_scale = bs.clip_by_global_norm(
                    grads, grad_scale=grad_scale, clip_norm=hps.clip_norm)

                # Apply AdamOptimizer:
                # fp16 mode is a special feature to store running mean and variance variables in custom fp16 formats.
                # Using this mode should incur no loss in accuracy and save a lot of memory in your model.
                # For further memory savings consider using bs.AdafactorOptimizer.
                adam = bs.AdamOptimizer(learning_rate=learning_rate,
                                        norm_scale=norm_scale,
                                        grad_scale=grad_scale,
                                        fp16=hps.float16)

                train_op = adam.apply_gradients(zip(grads, params))

                # update global step after we're done using it for this update
                with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
                    update_op = tf.assign_add(global_step, 1.0)

                return loss, tf.group(train_op,
                                      update_op), global_norm, norm_scale

            else:
                if mpi_size > 1:
                    loss = bs.allreduce(bs.scale_tensor(loss, mpi_scale))

                return loss
    def testBlocksparseMatMul(self):
        """Compare bs.BlocksparseMatMul on the GPU with its numpy reference.

        A block layout is built from the adjacency matrix of a
        Barabasi-Albert graph (plus the identity, with the top-left ``m x m``
        corner made dense).  For every (block size, feature axis), minibatch
        size ``N`` and dtype, the op is chained ``depth`` times, gradients
        w.r.t. the input and the weights are taken, and -- unless ``bench`` is
        set -- the results are compared against ``bsmm.fprop_test`` /
        ``bprop_test`` / ``updat_test``.

        The comparison only prints error statistics; the ``assertAllClose``
        call is commented out, so this method does not fail on mismatch.

        Reads these names from outside the method: ``conf``, ``one``,
        ``dtypes``, ``l2norm``, ``depth``, ``bench`` and ``out``.
        """

        # layout = np.zeros((2,2), dtype=np.int32)
        # layout[0,0] = 1

        n, m = 160, 5
        layout = networkx.generators.barabasi_albert_graph(n, m)
        #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
        layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m,0:m] = 1

        #layout[0:60,0:60] = 1
        #layout = np.zeros((4,4), dtype=np.int32)
        #layout = np.ones((4,4), dtype=np.int32)

        #layout[0,0] = 1

        #layout = np.ones((1,1), dtype=np.int32)
        blocks = layout.sum()
        n = layout.shape[0]
        # Percentage of layout entries that are set, then max/min column sums.
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max(), layout.sum(axis=0).min())
        #exit()

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            for bsize, axis in ( (32,0), (16,0), (8,0), ): # (32,1), (32,0), (16,0), (8,0)

                bsmm = bs.BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

                # `one` switches between deterministic inputs and random ones.
                if one:

                    # Every block is overwritten with an identity matrix.
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    for w in range(bsmm.blocks):
                        #c, k = bsmm.block_coord(w)
                        #if c == k:
                        W[w] = np.eye(bsmm.bsize, dtype=np.float32)

                    # W = np.ones(bsmm.w_shape, dtype=np.float32)
                    # W[:] += np.arange(32, dtype=np.float32).reshape(1,1,32)
                else:
                    # W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float16).astype(np.float32)
                    # Round-tripped through float16 so the float32 values are
                    # exactly representable in half precision.
                    W = np.random.normal(loc=0.0, scale=0.01, size=bsmm.w_shape).astype(np.float16).astype(np.float32)



                # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
                # for w, (c, k) in enumerate(bsmm.updat_list):
                #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

                # Note: `w` is rebound here from the loop index above to a tensor.
                w = tf.constant(W)

                # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
                # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
                # print("identity_init: ", (s1 - s2).max())
                # exit()

                for N in (256,128,64,32,16,8,): # 128,64,32,16,1,  256,512,1024,2048,4096, 256,1024,4096,16384

                    # X is the input, E the error signal fed into the gradient.
                    if one:
                        X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                        E = np.ones(bsmm.o_shape(N), dtype=np.float32)

                        # X = np.eye(bsmm.bsize, dtype=np.float32)
                        # E = np.arange(X.size, dtype=np.float32).reshape(X.shape)

                        # X[:] += np.arange(X.size, dtype=np.float32).reshape(X.shape)
                        # X[:] += np.arange(32, dtype=np.float32).reshape(32,1)
                        # E[:] += np.arange(16, dtype=np.float32).reshape(1,32)
                        # X[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                        # E[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                    else:
                        # X = np.random.uniform(0.0, 10.0, bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                        # E = np.random.uniform(0.0, 10.0, bsmm.o_shape(N)).astype(np.float16).astype(np.float32)
                        X = np.random.normal(loc=0.0, scale=0.1, size=bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                        E = np.random.normal(loc=0.0, scale=0.1, size=bsmm.o_shape(N)).astype(np.float16).astype(np.float32)

                    x = tf.constant(X)
                    e = tf.constant(E)

                    for dtype in dtypes:

                        print("Axis:%d Bsize:%2d N:%d dtype:%s Params:%d" % (axis, bsize, N, dtype.name, bsize*bsize*blocks))

                        # compute in tensorflow
                        if l2norm:
                            w2 = bsmm.l2_normalize(w, dtype=dtype)
                        else:
                            w2 = bs.float_cast(w, dtype=dtype)

                        y = bs.float_cast(x, dtype=dtype)

                        # Apply the same op `depth` times with the same weights;
                        # only the last application receives a nonzero `bench`.
                        for j in range(depth):
                            repeat = bench if bench and j==depth-1 else 0
                            y = bsmm(y, w2, bench=repeat) # (bench and j==depth-1) (bench and j==0)

                        y = bs.float_cast(y, dtype=tf.float32)

                        #if bench: sess.run( y )
                        #y = sess.run( y )
                        # Gradient ops are created under a control dependency
                        # on the forward output op.
                        with tf.control_dependencies([y.op]):
                            d = bs.gradients(y, [x, w], e)
                        # presumably merges the weight-gradient contributions
                        # from the `depth` reuses of w2 -- confirm against
                        # bs.group_param_grads.
                        if depth > 1:
                            d[1] = bs.group_param_grads(d[1], 8)

                        sess.run(tf.global_variables_initializer())


                        #y, = sess.run( [y] )
                        # From here on `y` is a numpy array, not a tensor.
                        y, (dx, dw) = sess.run( [y, d ] )

                        if not bench:
                            # compute in numpy
                            if l2norm:
                                W2 = bsmm.l2_normalize_test(W)
                            else:
                                W2 = W

                            # Forward: keep every layer input so the backward
                            # pass below can pop them in reverse order.
                            Ys = [X]
                            for j in range(depth):
                                Ys.append(bsmm.fprop_test(Ys[-1], W2))
                            Y = Ys.pop()

                            # Backward: accumulate the weight gradient over all
                            # layers and propagate the error towards the input.
                            DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                            DX = E
                            for j in range(depth):
                                DW += bsmm.updat_test(Ys.pop(), DX)
                                DX  = bsmm.bprop_test(DX, W2)
                            if l2norm:
                                DW = bsmm.l2_normalize_grad_test(W, DW)

                            for op, cpuA, devA in (
                                (" y:",  Y,  y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                            ):

                                difA = abs(cpuA - devA)

                                # Max abs difference relative to the mean abs
                                # reference value (absolute if that mean is 0).
                                avgval  = np.average(abs(cpuA))
                                maxdif  = difA.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval

                                # NOTE(review): no epsilon in the denominator,
                                # so an all-zero reference divides by zero.
                                l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())

                                #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))

                                print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                                # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                                # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                                # Debug dump: writes the first compared tensor
                                # to text files and terminates the process, so
                                # later tensors are never dumped.
                                if out:
                                    np.savetxt("out.txt",  difA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    np.savetxt("outC.txt", cpuA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    np.savetxt("outD.txt", devA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    exit()
                            print("")
    def atestSparseProj(self):
        """Compare bs.SparseProj gather / scatter_add / scatter_mul with numpy.

        The numpy references are built by indexing with ``sproj.gather_lut``;
        forward outputs and gradients from the GPU ops are then compared with
        them and error statistics are printed (nothing is asserted).

        The method name starts with ``atest`` rather than ``test``, so it is
        presumably excluded from default test discovery -- confirm.

        Reads ``conf``, ``one`` and ``out`` from outside the method.
        """
        nhidden = 1024*8
        nproj   = 1024
        N       = 64

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            # X/Y are the forward inputs, EX/EY the error signals used as the
            # incoming gradients for the scatter and gather ops respectively.
            if one:
                X  = np.ones((nhidden,N), dtype=np.float32)
                Y  = np.ones((  nproj,N), dtype=np.float32)
                EX = np.ones((nhidden,N), dtype=np.float32)
                EY = np.ones((  nproj,N), dtype=np.float32)
            else:
                X  = np.random.uniform(-1.0, 1.0, (nhidden,N)).astype(np.float32)
                Y  = np.random.uniform(-1.0, 1.0, (  nproj,N)).astype(np.float32)
                EX = np.random.uniform(-1.0, 1.0, (nhidden,N)).astype(np.float32)
                EY = np.random.uniform(-1.0, 1.0, (  nproj,N)).astype(np.float32)

            x  = tf.constant(X)
            y  = tf.constant(Y)
            ex = tf.constant(EX)
            ey = tf.constant(EY)

            sproj = bs.SparseProj(nhidden, nproj)
            lut   = sproj.gather_lut

            # Numpy reference for the forward ops: select rows `lut` of X, and
            # add / multiply Y into those rows of a copy of X.
            SLC = X[lut,:]
            ADD = X.copy()
            MUL = X.copy()
            ADD[lut,:] += Y
            MUL[lut,:] *= Y

            # Reference gradient of the gather: EY scattered into zeros.
            # NOTE(review): the shape comes from the TF tensor `x`, not the
            # numpy array `X`, and the result is float64 by default -- confirm
            # this is intended.
            SLC_DX = np.zeros(x.shape)
            SLC_DX[lut,:] = EY

            # Reference gradients of scatter_add. ADD_DX aliases EX (no copy).
            ADD_DX = EX
            ADD_DY = EX[lut,:]

            # Reference gradients of scatter_mul.
            MUL_DX = EX.copy()
            MUL_DX[lut,:] *= Y

            MUL_DY = EX[lut,:] * X[lut,:]

            slc_op = sproj.gather(x)
            mul_op = sproj.scatter_mul(x, y)
            add_op = sproj.scatter_add(x, y)

            # Evaluation order matters: per the original notes below, some of
            # these ops write into their input buffers.
            slc = sess.run( slc_op )
            mul = sess.run( mul_op )
            add = sess.run( add_op ) # this op overwrites x, run last

            slc_dx,        = sess.run( bs.gradients(slc_op, [x  ], ey) )
            add_dx, add_dy = sess.run( bs.gradients(add_op, [x,y], ex) )
            mul_dx, mul_dy = sess.run( bs.gradients(mul_op, [x,y], ex) ) # this op overwrites ex, run last

            for op, cpuA, devA in (
                ("slc:", SLC, slc),
                ("add:", ADD, add),
                ("mul:", MUL, mul),
                ("slc_dx:", SLC_DX, slc_dx),
                ("add_dx:", ADD_DX, add_dx),
                ("add_dy:", ADD_DY, add_dy),
                ("mul_dx:", MUL_DX, mul_dx),
                ("mul_dy:", MUL_DY, mul_dy),
            ):

                difA = abs(cpuA - devA)

                # Max abs difference relative to the mean abs reference value
                # (absolute if that mean is 0).
                avgval  = np.average(abs(cpuA))
                maxdif  = difA.max()
                max_err = maxdif if avgval == 0 else maxdif / avgval

                l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())
                print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                # Debug dump: writes the first compared tensor to text files
                # and terminates the process, so later entries are never dumped.
                if out:
                    np.savetxt("out.txt",  difA, fmt='%5.1f')
                    np.savetxt("outC.txt", cpuA, fmt='%5.1f')
                    np.savetxt("outD.txt", devA, fmt='%5.1f')
                    exit()
    def atestBlocksparseMatMulGated(self):
        """Compare a gated bs.BlocksparseMatMul with its numpy reference.

        Builds a block layout from a Barabasi-Albert graph, applies a
        per-block gate that is 1 where the block's two coordinates have the
        same parity and 0 otherwise, runs the op plus its gradients on the
        GPU, and prints error statistics against ``bsmm.fprop_test`` /
        ``bprop_test`` / ``updat_test``.  Nothing is asserted.

        The method name starts with ``atest`` rather than ``test``, so it is
        presumably excluded from default test discovery -- confirm.

        Reads ``conf``, ``one`` and ``out`` from outside the method.
        """
        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            # Problem dimensions and op options.
            minibatch = 128
            features = 8*56*2*4
            nodes = features//8
            attach = 30
            compute_dtype = tf.float32
            bench_repeat = 0
            gate_dw = False
            blk_size = 8

            # Layout = graph adjacency + identity, with a dense top-left corner.
            graph = networkx.generators.barabasi_albert_graph(nodes, attach)
            layout = networkx.adjacency_matrix(graph).toarray().astype(np.int32) + np.eye(nodes, dtype=np.int32)
            layout[0:attach,0:attach] = 1

            n_blocks = layout.sum()
            nodes = layout.shape[0]
            print(100 * n_blocks / nodes**2)
            print(layout.sum(axis=0).max())

            bsmm = bs.BlocksparseMatMul(layout, block_size=blk_size, feature_axis=0, name="test")

            in_shape = bsmm.i_shape(minibatch)
            out_shape = bsmm.o_shape(minibatch)

            if one:
                X_np = np.ones(in_shape, dtype=np.float32)
                E_np = np.ones(out_shape, dtype=np.float32)
                W_np = np.ones(bsmm.w_shape, dtype=np.float32)
                G_np = np.ones(bsmm.blocks, dtype=np.float32)
            else:
                def draw(low, shape):
                    # Uniform float32 samples in [low, 1.0).
                    return np.random.uniform(low, 1.0, shape).astype(np.float32)

                # Draw order (X, E, W, G) is kept so RNG consumption is unchanged.
                X_np = draw(-1.0, in_shape)
                E_np = draw(-1.0, out_shape)
                W_np = draw(-1.0, bsmm.w_shape)
                G_np = draw(0.0, bsmm.blocks)

            # The gate chosen above is discarded: it is always replaced by a
            # parity pattern (1 when c and k are both even or both odd).
            G_np = np.ones(bsmm.blocks, dtype=np.float32)
            for blk_idx, (c, k) in enumerate(bsmm.updat_list):
                G_np[blk_idx] = (c & 1) ^ (k & 1) ^ 1

            x_t = tf.constant(X_np)
            e_t = tf.constant(E_np)
            w_t = tf.constant(W_np)
            g_t = tf.constant(G_np)

            w_cast = bs.float_cast(w_t, dtype=compute_dtype)
            x_cast = bs.float_cast(x_t, dtype=compute_dtype)

            y_t = bsmm(x_cast, w_cast, gate=g_t, gate_grad=True, dw_gated=gate_dw, bench=bench_repeat)
            y_t = bs.float_cast(y_t, dtype=tf.float32)

            grads_t = bs.gradients(y_t, [x_t, w_t], e_t)

            sess.run( tf.global_variables_initializer() )

            y_dev, (dx_dev, dw_dev) = sess.run( [y_t, grads_t] )

            # Numpy reference results.
            y_ref  = bsmm.fprop_test(X_np, W_np, gate=G_np)
            dx_ref = bsmm.bprop_test(E_np, W_np, gate=G_np)
            dw_ref = bsmm.updat_test(X_np, E_np, gate=G_np, dw_gated=gate_dw)

            comparisons = (
                (" y:", y_ref, y_dev),
                ("dx:", dx_ref, dx_dev),
                ("dw:", dw_ref, dw_dev),
            )
            for label, ref, dev in comparisons:

                diff = abs(ref - dev)

                # Max abs difference relative to the mean abs reference value
                # (absolute if that mean is 0).
                mean_abs = np.average(abs(ref))
                worst = diff.max()
                if mean_abs == 0:
                    max_err = worst
                else:
                    max_err = worst / mean_abs

                diff_norm = np.sqrt(np.square(diff).sum())
                ref_norm = np.sqrt(np.square(ref).sum() + 1e-12)
                l2_err = diff_norm / ref_norm

                print("%s max_err%%:%11.8f L2_err: %12.10f" % (label, 100*max_err, l2_err))

                # Debug dump: writes the first compared tensor to text files
                # and terminates the process.
                if out:
                    cols = features if label == "dw:" else minibatch
                    for path, arr in (("out.txt", diff), ("outC.txt", ref), ("outD.txt", dev)):
                        np.savetxt(path, arr.reshape((-1,cols)), fmt='%5.1f')
                    exit()