Example #1
def true_density(z):
    z1, z2 = z[0], z[1]
    norm = C.sqrt(C.square(z1) + C.square(z2))
    exp1 = C.exp(-0.5 * C.square((z1 - 2) / 0.8))
    exp2 = C.exp(-0.5 * C.square((z1 + 2) / 0.8))
    u = 0.5 * C.square(((norm - 4) / 0.4)) - C.log(exp1 + exp2)
    return C.exp(-u)
def instance_normalization(x):
    mean = C.reduce_mean(x, axis=(1, 2))
    x0 = x - mean
    std = C.sqrt(C.reduce_mean(x0 * x0, axis=(1, 2)))
    if epsilon != 0:
        std += epsilon
    x_hat = x0 / std
    return x_hat * C.reshape(scale, (-1, 1, 1)) + C.reshape(bias, (-1, 1, 1))
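`instance_normalization` reads `epsilon`, `scale` and `bias` from its surrounding scope. A minimal sketch of how those names might be bound and the function applied to a 3-channel feature map (the shapes and values below are illustrative assumptions, not part of the original snippet):

import cntk as C
import numpy as np

epsilon = 1e-5                        # small constant guarding the division
scale = C.parameter((3,), init=1.0)   # one scale per channel
bias = C.parameter((3,), init=0.0)    # one bias per channel

x = C.input_variable((3, 8, 8))       # (channels, height, width)
y = instance_normalization(x)         # normalize each channel over its spatial axes
print(y.eval({x: np.random.rand(1, 3, 8, 8).astype(np.float32)}).shape)  # (1, 3, 8, 8)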
Example #3
def test_grad_custimized_root():
    x = C.input_variable(shape=(1,), needs_gradient=True)
    y = C.sqrt(x)
    y2 = C.log(x)
    combine = C.combine([y.output, y2.output])
    a = np.asarray([1,4,16], dtype=np.float32).reshape(3,1)
    grads = combine.grad({x:a}, grad_root = y.output)
    expect_grad = np.asarray([[0.5],[0.25],[0.125]], dtype=np.float32)
    assert np.array_equal(grads, expect_grad)
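The expected values follow from d√x/dx = 1/(2√x), which gives 0.5, 0.25 and 0.125 at x = 1, 4 and 16; passing grad_root=y.output restricts backpropagation to the sqrt branch, so the log output contributes nothing to the gradient.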
Example #4
def test_grad_custimized_root():
    x = C.input(shape=(1, ), needs_gradient=True)
    y = C.sqrt(x)
    y2 = C.log(x)
    combine = C.combine([y.output, y2.output])
    a = np.asarray([1, 4, 16], dtype=np.float32).reshape(3, 1)
    grads = combine.grad({x: a}, grad_root=y.output)
    expect_grad = np.asarray([[0.5], [0.25], [0.125]], dtype=np.float32)
    assert np.array_equal(grads, expect_grad)
Example #5
    def squash(input):

        # ||Sj||^2
        Sj_squared_norm = ct.reduce_sum(ct.square(input), axis=axis)

        # ||Sj||^2 / (1 + ||Sj||^2) * (Sj / ||Sj||)
        factor = ct.element_divide(
            ct.element_divide(Sj_squared_norm, ct.plus(1, Sj_squared_norm)),
            ct.sqrt(ct.plus(Sj_squared_norm, epsilon)))
        return factor * input
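This is the capsule-network squashing nonlinearity, squash(S_j) = (||S_j||² / (1 + ||S_j||²)) · (S_j / ||S_j||). Both `axis` and `epsilon` are expected from the enclosing scope; `epsilon` keeps the square root away from zero when a capsule vector has zero length.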
Example #6
    def var(array, W=_W, B=None, square=0, sqrt=0, V=False, sizz=0):
        # W = tf.transpose(W, [0, 2, 3, 1])

        arrs = array.shape
        ashp = W.shape
        sb = (W.shape[1], 1, 1)
        WV = W.shape[-2:]
        xi = (-2, -1)
        x2 = (-2, -1, -3)

        if V:
            print(W.eval())
            print(arrs, ashp)
        mul = array * W

        if V:
            print('Wsamp', W[-1, -1].eval())
            print('array*w', mul.eval()[0, -1])

        size = C.reduce_sum(W, axis=xi)  # shape = (outputs, channel)

        if V:
            print("sizesamp", size.shape, size.eval())
        if B is None:
            B = C.constant(0, shape=W.shape[0:2], dtype=np.float32)  # channel
        B = C.reshape(B, (*B.shape, *[1 for _ in range(len(ashp) - len(B.shape))]))
        if sizz == 1:
            mean = C.reduce_sum(mul, axis=xi) / size
        else:
            mean = C.reduce_sum(mul, axis=xi) / C.constant(value=WV[0] * WV[1], shape=sb, dtype=np.float32)
        if V:
            print("meansamp", mean.eval()[0, -1])
        if square:
            i = C.square(mul - mean) + B
        else:
            i = (mul - mean) + B
        di = i / size
        if V == 2:
            print("i", i.eval(), "i")
            print("di", di.eval(), "di")
        if V:
            print('isamp', i.shape, i.eval()[-1, -1])
        out = C.reduce_sum(i + B, axis=x2)
        # out = np.rollaxis(np.sum(i + B, axis=x2), -1, 1)
        print(out.shape)
        if sqrt:
            out = C.sqrt(out)
        out = C.swapaxes(C.reshape(out, out.shape[:4]), 3, 1)
        print(out.shape)
        assert out.shape == (arrs[0], ashp[0], arrs[1], arrs[2])
        return out
Example #7
    def attention(query, key, value):
        dk = C.sqrt(C.reduce_sum(C.ones_like(
            query)))  # cannot use sequence.last, will conflict with recurrence
        # dk: [#, *] [1, ] and value = int(dim_of_query)

        unpacked_key = C.sequence.unpack(
            key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
        unpacked_value = C.sequence.unpack(
            value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

        broadcasted_key = C.sequence.broadcast_as(
            unpacked_key, query)  # [#, *] [-3, key_dim]
        scaled = C.times_transpose(query, broadcasted_key) / dk
        # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
        # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

        # masked out invalid temporal connections to obey_sequence_order
        if obey_sequence_order and max_seq_len:
            unpacked_scaled, scaled_mask = C.sequence.unpack(
                scaled, padding_value=0).outputs
            # unpacked_scaled: [#] [-3, -3]  <== matrix will be top right diagonally zero-ed
            # scaled_mask: [#] [-3,]

            minus_inf = C.constant(-1e+30)
            valid_connections = C.Constant(
                np.tril(np.ones((max_seq_len, max_seq_len)),
                        k=0))  # [] [max_seq, max_seq]
            valid_connections = C.reconcile_dynamic_axes(
                valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
            valid_connections = C.crop_manual(valid_connections,
                                              unpacked_scaled, 0,
                                              0)  # [#] [-3, -3]
            unpacked_scaled = C.element_select(valid_connections,
                                               unpacked_scaled,
                                               minus_inf)  # [#] [-3, -3]
            scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

        elif obey_sequence_order and not max_seq_len:
            raise ValueError(
                "max_seq_len must be defined when obey_sequence_order is True")

        attended = C.times(C.softmax(scaled, axis=-1),
                           C.sequence.broadcast_as(
                               unpacked_value, query))  # [#, *] [value_dim,]
        return attended
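This closure implements scaled dot-product attention: the scores are QKᵀ divided by √d_k, where dk is built in-graph as the square root of the query dimension (reduce_sum over ones_like(query)). `obey_sequence_order` and `max_seq_len` are free variables expected from the enclosing layer factory; when set, the lower-triangular matrix masks attention to future positions with -inf before the softmax.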
Example #8
def layer_normalization(inputs: C.Function,
                        name='layer_normalization') -> C.Function:
    X = C.placeholder(
        inputs.shape,
        (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    mu = C.reduce_mean(X, name='mu')
    sigma = C.sqrt(C.reduce_mean(C.square(X - mu)), name='sigma')

    result = (X - mu) / sigma

    #region scale + bias
    scale = C.parameter(inputs.shape, init=1, name='scale')
    bias = C.parameter(inputs.shape, init=0, name='bias')
    result = result * scale + bias
    #endregion

    block = C.as_block(result, [(X, X)], name)

    return block(inputs)
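A minimal usage sketch (an assumption, not from the original source), applying the block to a sequence of 8-dimensional feature vectors:

import cntk as C
import numpy as np

feature_dim = 8
x = C.sequence.input_variable(feature_dim)
ln = layer_normalization(x)

seq = [np.random.rand(5, feature_dim).astype(np.float32)]  # one sequence of length 5
print(ln.eval({x: seq})[0].shape)  # (5, 8)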
Example #9
def sqrt(x, name=''):
    '''
    Computes the element-wise square-root of `x`: 

    :math:`sqrt(x) = {\sqrt[2]{x}}`

    Example:
        >>> C.eval(C.sqrt([0., 4.]))
        [array([[ 0.      ,  2.]])]

    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`        
        
    Note:
        CNTK returns zero for the sqrt of negative numbers; this will be
        changed to return NaN.
    '''
    from cntk import sqrt
    x = sanitize_input(x)
    return sqrt(x, name).output()    
Example #10
def sqrt(x, name=''):
    '''
    Computes the element-wise square-root of `x`: 

    :math:`sqrt(x) = {\sqrt[2]{x}}`

    Example:
        >>> C.eval(C.sqrt([0., 4.]))
        [array([[ 0.      ,  2.]])]

    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`        
        
    Note:
        CNTK returns zero for the sqrt of negative numbers; this will be
        changed to return NaN.
    '''
    from cntk import sqrt
    x = sanitize_input(x)
    return sqrt(x, name).output()    
def test_sqrt():
    assert_cntk_ngraph_isclose(C.sqrt([0., 4.]))
    assert_cntk_ngraph_isclose(C.sqrt([[1, 2], [3, 4]]))
    assert_cntk_ngraph_isclose(C.sqrt([[[1, 2], [3, 4]], [[1, 2], [3, 4]]]))
Example #12
# Create CNTK inputs
input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)


def create_model(features):
    with C.layers.default_options(init=C.glorot_uniform()):
        r = C.layers.Dense(num_output_classes, activation=None)(features)
        return r


# Scale the input to 0-1 range by dividing each pixel by 255
# z represents the output of the network -> z = Wx' + b
input_s = input / 255
squared_input = C.square(input_s)
sqrted_input = C.sqrt(input_s)

normalized_input = C.splice(input_s, squared_input, sqrted_input)
z = create_model(normalized_input)

# Define loss to minimize the cross-entropy between the label and predicted
# probability by the network
loss = C.cross_entropy_with_softmax(z, label)

# Define the evaluation (metric) function to report how well our model is performing
label_error = C.classification_error(z, label)

# Instantiate the trainer object to drive the model training
learning_rate = 0.2
lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr_schedule)
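The snippet stops at the learner; a plausible continuation (mirroring the Trainer wiring used verbatim in the later examples on this page) would be:

trainer = C.Trainer(z, (loss, label_error), [learner])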
Example #13
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    if k_ph is False and v_ph is False:
        q = C.layers.Dense(out_dims, name=name + '_q')(
            X
        )  # W_Q = C.parameter((in_dims, out_dims), init=init, name=name+'_q')
        k = C.layers.Dense(out_dims, name=name + '_k')(
            X
        )  # W_K = C.parameter((in_dims, out_dims), init=init, name=name+'_k')
        v = C.layers.Dense(out_dims, name=name + '_v')(
            X
        )  # W_V = C.parameter((in_dims, out_dims), init=init, name=name+'_v')
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # div_k

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention',
                              'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)],
                              'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
Example #14
def test_Sqrt(tmpdir, dtype):
    with C.default_options(dtype = dtype):
        model = C.sqrt(np.array([0., 4.]).astype(dtype))
        verify_no_input(model, tmpdir, 'Sqrt_0')
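Here `tmpdir` is pytest's built-in fixture and `dtype` is presumably supplied by a parametrization such as `@pytest.mark.parametrize("dtype", [np.float32, np.float64])`; `verify_no_input` is a helper assumed to be defined in the surrounding test module.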
Example #15
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    chunk = {}
    log_det_J = 0

    chunk['input_dim'] = input_dim
    _ph = C.placeholder(input_dim, name='place_holder')
    _out = _ph

    if batch_norm:
        # _bn = C.layers.BatchNormalization(name='batch_norm')(_ph)
        # chunk['scale'] = _bn.parameters[0]
        # chunk['bias'] = _bn.parameters[1]

        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        _eps = C.Constant(1e-7)
        _mu = C.reduce_mean(_ph, axis=C.Axis.default_batch_axis())
        _var = C.reduce_mean(C.square(_ph-_mu), axis=C.Axis.default_batch_axis())

        chunk['muB'] = _mu
        chunk['varB'] = _var

        # _bn = (_ph-chunk['mu'])/C.sqrt(chunk['var']+_eps)
        _bn = C.sqrt(chunk['var']+_eps)*_ph + chunk['mu']
        _ph = _bn

        log_det_J += -0.5*C.reduce_sum(C.log((_var+_eps)))
        # log_det_J += C.reduce_sum(C.log())

    chunk['W_rot_mat'] = _W = C.parameter((input_dim, input_dim))
    _W.value = random_rotation_matrix = special_ortho_group.rvs(input_dim)
    # _W.value = np.roll(np.eye(input_dim),input_dim//2,axis=0)
    _out = _ph@_W
    log_det_J += C.log(C.abs(C.det(_W))) # or # log_det_J += C.slogdet(_W)[1]
    
    _half_dim = input_dim//2
    _x1 = _out[:_half_dim]
    _x2 = _out[_half_dim:]

    _log_s_func, _t_func = act_func_pair
    if _log_s_func is None: # basic network
        _log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim, C.tanh),
        ])#(C.placeholder(input_dim, name='place_holder'))
    if _t_func is None: # basic network
        _t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim),
        ])#(C.placeholder(input_dim, name='place_holder'))

    chunk['log_s_func'] = _log_s_func
    chunk['t_func'] = _t_func

    _log_s, _t = _log_s_func(_x2), _t_func(_x2)

    _s = C.exp(_log_s)

    _y1 = _s*_x1 + _t
    _y2 = _x2

    _Y = C.splice(_y1, _y2)
    chunk['output'] = _Y

    log_det_J += C.reduce_sum(_log_s)

    return _Y, log_det_J, chunk
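`flow_forward` assembles one step of a RealNVP-style normalizing flow: an optional batch normalization (off by default), a fixed random rotation drawn with scipy's `special_ortho_group`, and an affine coupling layer computing y1 = exp(log s(x2)) · x1 + t(x2) with y2 = x2, while accumulating the log-determinant of the Jacobian in `log_det_J`.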
Example #16
def test_Sqrt(tmpdir):
    model = C.sqrt([0., 4.])
    verify_no_input(model, tmpdir, 'Sqrt_0')
# Define the layer dimensions
num_hidden_layers = 2
hidden_layers_dim = 400

def create_model(features):
    with cntk.layers.default_options(init = cntk.glorot_uniform(), activation = cntk.ops.relu):
        input = features
        for _ in range(num_hidden_layers):
            input = cntk.layers.Dense(hidden_layers_dim)(input)
        r = cntk.layers.Dense(num_output_classes, activation = None)(input)
        return r

# Scale the input to 0-1 range by dividing each pixel by 255.
input_s_normalized = input/255.0
input_s_squared = cntk.square(input_s_normalized)
input_s_sqrt = cntk.sqrt(input_s_normalized)
z_model = create_model(input_s_normalized)

# Define the loss function for is_training
loss = cntk.cross_entropy_with_softmax(z_model, label)

# Classification error evaluation
label_error = cntk.classification_error(z_model, label)

# Configure training parameters
# Instantiate the trainer object to drive the model training
learning_rate = 0.2
lr_schedule = cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch)

# Stochastic Gradient Descent learner
learner = cntk.sgd(z_model.parameters, lr_schedule)
Example #18
if not os.path.exists(data_dir):
    data_dir = os.path.join("data", "MNIST")

print('Writing train text file...')
savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train)

print('Writing test text file...')
savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test)

print('Done')

input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)
normalize_input = input / 255.0
squared_input = C.square(input / 255.0)
sqrt_input = C.sqrt(input / 255.0)

z = create_model(C.splice(normalize_input, squared_input, sqrt_input))

loss = C.cross_entropy_with_softmax(z, label)

label_error = C.classification_error(z, label)

lr_schedule = C.learning_parameter_schedule(learning_rate)

learner = C.sgd(z.parameters, lr_schedule)

trainer = C.Trainer(z, (loss, label_error), [learner])

data_found = False
Example #19
    def length(input):
        return ct.reshape(
            ct.sqrt(ct.reduce_sum(ct.square(input), axis=1) + epsilon),
            (10, 1))
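`length` returns the L2 norm of each of the 10 output capsules, reshaped to (10, 1); `epsilon`, taken from the enclosing scope, keeps the square root well behaved when a capsule vector is all zeros.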
Example #20
def test_Sqrt(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        model = C.sqrt(np.array([0., 4.]).astype(dtype))
        verify_no_input(model, tmpdir, 'Sqrt_0')
#%%
input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)


#%%
def create_model(features):
    with C.layers.default_options(init=C.glorot_uniform()):
        r = C.layers.Dense(num_output_classes, activation=None)(features)
        return r


#%%
# Scale the input to 0-1 range by dividing each pixel by 255.
input_s = input / 255
input_s = C.splice(input_s, C.sqrt(input_s), C.square(input_s))
z = create_model(input_s)

#%%
loss = C.cross_entropy_with_softmax(z, label)

#%%
label_error = C.classification_error(z, label)

#%%
# Instantiate the trainer object to drive the model training
learning_rate = 0.2
lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])
Example #22
def test_Sqrt(tmpdir):
    model = C.sqrt([0., 4.])
    verify_no_input(model, tmpdir, 'Sqrt_0')
Example #23
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)

    # attn_c_attn_w = C.parameter((token_dims,3*token_dims), name='attn_c_attn_w')
    # qkv = C.reshape(X@attn_c_attn_w, (3,-1), name='qkv')

    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi head attention
    q_heads = [
        C.squeeze(q_mh[i], name='single_head_q' + str(i))
        for i in range(head_dims)
    ]
    k_heads = [
        C.squeeze(k_mh[i], name='single_head_k' + str(i))
        for i in range(head_dims)
    ]
    v_heads = [
        C.squeeze(v_mh[i], name='single_head_v' + str(i))
        for i in range(head_dims)
    ]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i)) # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')

        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion
        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion
        attention_head.append(attention_seq)


    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                          'gpt2_self_attention')

    return project
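Note that `triangular_matrix_seq` is an external helper assumed to be defined in the same module: it produces the triangular mask whose disallowed positions are pushed to -inf (via element_min) before the softmax, giving the causal masking required for GPT-2 style self-attention. Each head's scores are also scaled by 1/√d_head before the softmax, following the standard transformer formulation.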