def SCNN(vst_onlyTokens,
         dl_terms,
         dl_associations,
         vso,
         nbEpochs=150,
         batchSize=64,
         l_numberOfFilters=[4000],
         l_filterSizes=[1],
         phraseMaxSize=15):
    """Build and train a Shallow CNN mapping token-embedding phrases to the
    ontology vector space.

    Parameters
    ----------
    vst_onlyTokens : token-level vector space (consumed by prepare2D_data).
    dl_terms, dl_associations : term and term->concept association dicts.
    vso : ontology vector space; returned unchanged alongside the model.
    nbEpochs, batchSize : training loop settings.
    l_numberOfFilters, l_filterSizes : parallel lists; one Conv1D branch is
        built per filter size with the matching number of filters.
    phraseMaxSize : fixed phrase length (input sequence dimension).

    Returns
    -------
    (trained keras model, vso, list of tokens unknown to the embedding).
    """
    data, labels, l_unkownTokens, l_uncompleteExpressions = prepare2D_data(
        vst_onlyTokens, dl_terms, dl_associations, vso, phraseMaxSize)
    embeddingSize = data.shape[2]
    ontoSpaceSize = labels.shape[2]

    inputLayer = Input(shape=(phraseMaxSize, embeddingSize))

    # One convolution branch per requested filter size.
    l_subLayers = list()
    for i, filterSize in enumerate(l_filterSizes):
        convLayer = (layers.Conv1D(
            l_numberOfFilters[i],
            filterSize,
            strides=1,
            kernel_initializer=initializers.GlorotUniform()))(inputLayer)
        # Max-pool over the full valid-convolution length so each filter
        # contributes a single value (global max pooling).
        outputSize = phraseMaxSize - filterSize + 1
        pool = (layers.MaxPool1D(pool_size=outputSize))(convLayer)
        activationLayer = (layers.LeakyReLU(alpha=0.3))(pool)
        l_subLayers.append(activationLayer)

    if len(l_filterSizes) > 1:
        concatenateLayer = (layers.Concatenate(axis=-1))(
            l_subLayers)  # axis=-1 // concatenating on the last dimension
    else:
        concatenateLayer = l_subLayers[0]

    convModel = Model(inputs=inputLayer, outputs=concatenateLayer)
    fullmodel = models.Sequential()
    fullmodel.add(convModel)
    # Final projection into the ontology space.
    fullmodel.add(
        layers.Dense(ontoSpaceSize,
                     kernel_initializer=initializers.GlorotUniform()))
    fullmodel.summary()
    fullmodel.compile(
        optimizer=optimizers.Nadam(),
        loss=losses.LogCosh(),
        metrics=[metrics.CosineSimilarity(),
                 metrics.MeanSquaredError()])
    fullmodel.fit(data, labels, epochs=nbEpochs, batch_size=batchSize)
    return fullmodel, vso, l_unkownTokens
def __initialize_weights_and_biases(self, xavier):
    """Create per-layer weight and bias arrays for the network.

    Weights are (fan_out, fan_in) matrices, Glorot-uniform when ``xavier``
    is truthy, otherwise standard-normal. Biases are always standard-normal
    column vectors, one per non-input layer.
    """
    weight_list = []
    for fan_in, fan_out in zip(self.shape[:-1], self.shape[1:]):
        if xavier:
            w = initializers.GlorotUniform()(shape=(fan_out, fan_in)).numpy()
        else:
            w = np.random.randn(fan_out, fan_in)
        weight_list.append(w)
    self.weights = weight_list
    self.biases = [np.random.randn(layer_size, 1)
                   for layer_size in self.shape[1:]]
def SLFNN(vst_onlyTokens, dl_terms, dl_associations, vso, nbEpochs=100,
          batchSize=64):
    """Build and train the SLFNN baseline: a single dense layer mapping
    term vectors to the ontology vector space.

    Returns (trained keras model, vso, list of unknown tokens).
    """
    # Compose token vectors into term vectors.
    vstTerm, l_unknownToken = word2term.wordVST2TermVST(
        vst_onlyTokens, dl_terms)
    data, labels = getMatrix(dl_terms, vstTerm, dl_associations, vso,
                             symbol="___")
    inputSize = data.shape[1]
    ontoSpaceSize = labels.shape[1]

    model = models.Sequential()
    model.add(
        layers.Dense(units=ontoSpaceSize,
                     use_bias=True,
                     kernel_initializer=initializers.GlorotUniform(),
                     input_shape=(inputSize, )))
    model.summary()
    model.compile(
        optimizer=optimizers.Nadam(),
        loss=losses.LogCosh(),
        metrics=[metrics.CosineSimilarity(),
                 metrics.MeanSquaredError()])
    model.fit(data, labels, epochs=nbEpochs, batch_size=batchSize)
    return model, vso, l_unknownToken
def __init__(self, fin, fout=1):
    """Temporal attention scorer: projects ``fin``-dim features to ``fout``
    attention scores (default 1)."""
    super(TemporalAttention, self).__init__()
    self.fin = fin  # input dimension
    self.fout = fout  # output dimension; 1 here because a single score is computed
    self.initializer = initializers.GlorotUniform()  # weight init distribution
    # Custom learnable parameter: the score projection weight.
    self.w = tf.Variable(
        self.initializer(shape=[self.fin, self.fout], dtype=tf.float32))
def __init__(self, fout):
    """Decoder head: dense projection to ``fout`` with a learnable residual
    weight and a separate learnable bias."""
    super(Decoder, self).__init__()
    self.fout = fout
    self.fc = layers.Dense(self.fout)
    # Learnable scalar gating the residual branch (starts at 0).
    self.resweight = tf.Variable(0.0, trainable=True)
    # Linear function parameters:
    self.initializer = initializers.GlorotUniform()
    self.bias = tf.Variable(
        self.initializer(shape=[self.fout], dtype=tf.float32))
def computeInitializer(cls, seed=None):
    """Compute a Glorot-uniform layer initializer.

    Parameters:
    - seed -- seed for the random generator used by the initializer.
      If it is None a unique repeatable seed is generated.

    Returns the instantiated initializer.
    """
    # Use `is not None` so an explicit seed of 0 is honored.
    _seed = seed if seed is not None else UniqueSeed.getSeed()
    #_initializer=initializers.GlorotNormal(seed=_seed)
    # BUGFIX: previously the raw `seed` (possibly None) was passed here,
    # so the generated fallback `_seed` was computed but never used.
    _initializer = initializers.GlorotUniform(seed=_seed)
    return _initializer
def __init__(self, fout):
    """Graph-node transform: bias-free dense projection to ``fout`` with a
    separate learnable bias and LeakyReLU activation."""
    super(GraphNodes, self).__init__()
    self.fout = fout  # output feature size (e.g. 16)
    self.thresold = 1e-12  # eps guard against division by ~zero
    self.fc = layers.Dense(self.fout, use_bias=False)
    self.leakyrelu = layers.LeakyReLU(alpha=0.2)
    self.initializer = initializers.GlorotUniform()
    # Bias kept as an explicit variable because the Dense layer is bias-free.
    self.bias = tf.Variable(
        self.initializer(shape=[self.fout], dtype=tf.float32))
def __init__(self, p):
    """Embedding sub-network with hidden dimension ``p``."""
    super(EmbeddingNetwork, self).__init__()
    self.p = p
    # Dense projection applied over the last axis.
    self.theta = layers.Dense(p, input_shape=(None, p))
    # Extra learnable (1, p) parameter row, Glorot-uniform initialized.
    glorot = initializers.GlorotUniform()
    self.theta4 = tf.Variable(glorot(shape=(1, p)),
                              trainable=True,
                              dtype=tf.float32)
    self.relu_for_outputs = layers.ReLU()
def __init__(self,
             W_reg='l2',
             b_reg='l2',
             W_constraint='MinMaxNorm',
             b_constraint='MinMaxNorm',
             output_attention=False,
             **kwargs):
    """Attention layer configuration.

    String identifiers for regularizers/constraints are resolved to Keras
    objects via the respective ``get`` factories; ``output_attention``
    controls whether attention weights are also returned.
    """
    self.output_attention = output_attention
    self.initializer = initializers.GlorotUniform()
    # Resolve string specs into concrete Keras objects.
    self.weight_regularizers = regularizers.get(W_reg)
    self.bias_regularizers = regularizers.get(b_reg)
    self.weight_constraint = constraints.get(W_constraint)
    self.bias_constraint = constraints.get(b_constraint)
    super(Attention, self).__init__(dtype='float32', **kwargs)
def __init__(self, p):
    """Structure2Vec embedding update with embedding dimension ``p``."""
    super(Structure2Vec, self).__init__()
    self.p = p
    # Affine maps used by the embedding update rule.
    self.theta1 = layers.Dense(p, input_shape=(None, 1))
    self.theta2 = layers.Dense(p, input_shape=(None, p))
    self.theta3 = layers.Dense(p, input_shape=(None, p))
    # Learnable (1, p) parameter row, Glorot-uniform initialized.
    glorot = initializers.GlorotUniform()
    self.theta4 = tf.Variable(glorot(shape=(1, p)),
                              trainable=True,
                              dtype=tf.float32)
    self.relu_for_unit4 = layers.ReLU()
    self.relu_for_outputs = layers.ReLU()
def __init__(self, f_gcn, f_atten, channels=4):
    """Edge-enhanced GCN layer: per-channel attention weight plus a dense
    graph-convolution projection, each followed by batch norm."""
    super(EGCN, self).__init__()
    self.f_gcn = f_gcn  # GCN output feature size
    self.f_atten = f_atten  # attention feature size
    self.channels = channels  # number of edge channels
    # initialize custom parameters
    self.initializer = initializers.GlorotUniform()
    self.w_atten = tf.Variable(
        self.initializer(shape=[self.channels, self.f_atten],
                         dtype=tf.float32))  # w_atten
    self.bn = layers.BatchNormalization()  # bn
    self.w = layers.Dense(self.f_gcn)  # fc
    self.bn2 = layers.BatchNormalization()
def conv2d_layer(self, layer_metadata):
    """Build a keras Conv2D layer from a metadata dict.

    ``layer_metadata`` must provide "initializer" (xavier|random),
    "regularizer" (l1|l2|None) with "reg_ratio", "activation"
    (relu|sigmoid|softmax|tanh), plus the usual Conv2D geometry keys.
    When "batch_norm" is truthy the layer is built without an activation
    (the activation is expected to follow the batch-norm layer).

    Raises ValueError on any invalid option.
    """
    if layer_metadata["initializer"] == "xavier":
        initializer = initializers.GlorotUniform()
    elif layer_metadata["initializer"] == "random":
        initializer = initializers.RandomNormal()
    else:
        raise ValueError(
            "Specified initializer for {} is invalid: should be one of (xavier, random)"
            .format(layer_metadata))

    if layer_metadata["regularizer"] not in ("l1", "l2", None):
        raise ValueError(
            "Specified regularizer for {} is invalid: should be one of (l1, l2, None)"
            .format(layer_metadata))
    regularizer = self.get_regularizer(layer_metadata["regularizer"],
                                       layer_metadata["reg_ratio"])

    # BUGFIX: the whitelist used to contain the single malformed string
    # "sigmoid, softmax", so the valid activations "sigmoid" and "softmax"
    # were rejected; the message also never had its placeholder filled.
    if layer_metadata["activation"] not in ("relu", "sigmoid", "softmax",
                                            "tanh"):
        raise ValueError(
            "Activation specified for {} is invalid, should be one of (relu, sigmoid, softmax, tanh)"
            .format(layer_metadata))

    # With batch norm, defer the activation (built with activation=None).
    use_batch_norm = layer_metadata.get("batch_norm", False) == True
    activation = None if use_batch_norm else layer_metadata["activation"]
    return layers.Conv2D(filters=layer_metadata["filters"],
                         kernel_size=layer_metadata["kernel_size"],
                         strides=layer_metadata["strides"],
                         padding=layer_metadata["padding"],
                         data_format=self.data_format,
                         activation=activation,
                         kernel_initializer=initializer,
                         bias_initializer=initializer,
                         kernel_regularizer=regularizer,
                         bias_regularizer=regularizer)
def cal_states_similarity(self, state):
    """Embed two states through a fixed (seeded) random conv pipeline and
    return the Euclidean distance between their embeddings.

    NOTE(review): assumes `state` stacks (at least) two frame sequences on
    axis 0 — confirm against caller.
    """
    seed = 0  # fixed seed so the random projection is identical across calls
    input_t = tf.convert_to_tensor(state, dtype=np.float32)
    # Scale pixel values to [0, 1]; presumably uint8 image frames — verify.
    x = Lambda(lambda x: x / 255., name="input_normalizer")(input_t)
    x = TimeDistributed(
        Conv2D(filters=32,
               kernel_size=6,
               strides=6,
               kernel_initializer=initializers.GlorotUniform(seed),
               input_shape=x.shape))(x)
    x = LeakyReLU(0.01)(x)
    x = TimeDistributed(MaxPooling2D())(x)
    x = Flatten()(x)
    state = x.numpy()
    # Distance between the embeddings of the first two batch items.
    dist = np.linalg.norm(state[0, :] - state[1, :])
    return dist
def __init__(self, n_head, f_in, f_out, attn_dropout, bias=True):
    """Batched multi-head graph attention (GAT-style) layer parameters."""
    super(BatchMultiHeadGraphAttention, self).__init__()
    self.n_head = n_head  # number of attention heads
    self.f_in = f_in  # input feature size
    self.f_out = f_out  # output feature size
    self.attn_dropout = attn_dropout  # dropout rate on attention weights
    self.add_self_loop = True  # guard against nodes with no neighbors
    self.initializer = initializers.GlorotUniform()  # weight init distribution
    self.w = tf.Variable(
        self.initializer(shape=[self.n_head, self.f_in, self.f_out],
                         dtype=tf.float32))  # custom parameter: per-head weights
    self.adj = []
    self.fc = tf.Variable(
        self.initializer(shape=[self.n_head, 2 * self.f_out, 1],
                         dtype=tf.float32))  # custom parameter: attention vector
    self.leaky_relu = layers.LeakyReLU(alpha=0.2)  # activation
    self.softmax = layers.Softmax(axis=-1)  # attention normalization
    self.dropout = layers.Dropout(rate=self.attn_dropout)  # dropout layer
    if bias:
        self.bias = tf.Variable(tf.zeros(self.f_out))  # custom parameter: bias
def __init__(self, n_head, f_in, f_out, attn_dropout, bias=True):
    """Multi-head graph attention layer with separate source/destination
    attention vectors (GAT-style)."""
    super(MultiHeadGraphAttention, self).__init__()
    self.n_head = n_head  # number of attention heads
    self.f_in = f_in  # input feature size
    self.f_out = f_out  # output feature size
    self.isbias = bias  # whether a bias term is added
    self.initializer = initializers.GlorotUniform()  # weight init distribution
    self.w = tf.Variable(
        self.initializer(shape=[self.n_head, self.f_in, self.f_out],
                         dtype=tf.float32))  # custom parameter: per-head weights
    self.a_src = tf.Variable(
        self.initializer(shape=[self.n_head, self.f_out, 1],
                         dtype=tf.float32))  # custom parameter: source attention
    self.a_dst = tf.Variable(
        self.initializer(shape=[self.n_head, self.f_out, 1],
                         dtype=tf.float32))  # custom parameter: destination attention
    self.leaky_relu = layers.LeakyReLU(alpha=0.2)  # activation
    self.softmax = layers.Softmax(axis=-1)  # attention normalization
    self.dropout = layers.Dropout(rate=attn_dropout)  # dropout layer
    if self.isbias:
        self.bias = tf.Variable(tf.zeros(self.f_out))  # custom parameter: bias
def CNorm(vst_onlyTokens,
          dl_terms,
          dl_associations,
          vso,
          nbEpochs=30,
          batchSize=64,
          l_numberOfFilters=[4000],
          l_filterSizes=[1],
          phraseMaxSize=15):
    """Build and train CNorm: an ensemble averaging a linear projection
    (SLFNN) component and a Shallow-CNN component, both mapping phrases to
    the ontology vector space.

    Parameters mirror SCNN; see that function for details.

    Returns
    -------
    (trained keras model, vso, list of tokens unknown to the embedding).
    """
    # Preparing data for SLFNN and S-CNN components:
    dataSCNN, labels, l_unkownTokens, l_uncompleteExpressions = prepare2D_data(
        vst_onlyTokens, dl_terms, dl_associations, vso, phraseMaxSize)

    # SLFNN input: mean of the non-zero token embeddings of each phrase.
    dataSLFNN = numpy.zeros((dataSCNN.shape[0], dataSCNN.shape[2]))
    for i in range(dataSCNN.shape[0]):
        numberOfToken = 0
        for embedding in dataSCNN[i]:
            # All-zero rows are padding / unknown tokens; skip them.
            if numpy.any(embedding):
                numberOfToken += 1
                dataSLFNN[i] += embedding
        if numberOfToken > 0:
            dataSLFNN[i] = dataSLFNN[i] / numberOfToken

    # Input layers:
    inputLP = Input(shape=dataSLFNN.shape[1])
    inputCNN = Input(shape=[dataSCNN.shape[1], dataSCNN.shape[2]])

    # SLFNN component:
    ontoSpaceSize = labels.shape[2]
    denseLP = layers.Dense(
        units=ontoSpaceSize,
        use_bias=True,
        kernel_initializer=initializers.GlorotUniform())(inputLP)
    modelLP = Model(inputs=inputLP, outputs=denseLP)

    # Shallow-CNN component (one Conv1D branch per filter size):
    l_subLayers = list()
    for i, filterSize in enumerate(l_filterSizes):
        convLayer = (layers.Conv1D(
            l_numberOfFilters[i],
            filterSize,
            strides=1,
            kernel_initializer=initializers.GlorotUniform()))(inputCNN)
        # Pool over the full valid-convolution output (global max pooling).
        outputSize = phraseMaxSize - filterSize + 1
        pool = (layers.MaxPool1D(pool_size=outputSize))(convLayer)
        activationLayer = (layers.LeakyReLU(alpha=0.3))(pool)
        l_subLayers.append(activationLayer)
    if len(l_filterSizes) > 1:
        concatenateLayer = (layers.Concatenate(axis=-1))(
            l_subLayers)  # axis=-1 // concatenating on the last dimension
    else:
        concatenateLayer = l_subLayers[0]
    denseLayer = layers.Dense(
        ontoSpaceSize,
        kernel_initializer=initializers.GlorotUniform())(concatenateLayer)
    modelCNN = Model(inputs=inputCNN, outputs=denseLayer)
    # NOTE: an unused Sequential wrapper around the convolutional sub-graph
    # (convModel/fullmodel) was dead code and has been removed.

    # Combination of the two components:
    combinedLayer = layers.average([modelLP.output, modelCNN.output])
    fullModel = Model(inputs=[inputLP, inputCNN], outputs=combinedLayer)
    fullModel.summary()

    # Compile and train:
    fullModel.compile(
        optimizer=optimizers.Nadam(),
        loss=losses.LogCosh(),
        metrics=[metrics.CosineSimilarity(),
                 metrics.MeanSquaredError()])
    fullModel.fit([dataSLFNN, dataSCNN],
                  labels,
                  epochs=nbEpochs,
                  batch_size=batchSize)

    return fullModel, vso, l_unkownTokens
# Inverse-frequency class weights so masked and unmasked pixels contribute
# equally to the loss (each class weighted by total / (2 * class_count)).
unmask_count = total - mask_count
print(total, unmask_count, mask_count)
weight_for_0 = (1 / unmask_count) * (total) / 2.0
weight_for_1 = (1 / mask_count) * (total) / 2.0
class_weights = {0: weight_for_0, 1: weight_for_1}
print("Done, class_weights: ", class_weights)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import initializers

initializer = initializers.GlorotUniform()
print("Compiling ML models")
model = Sequential()
model.add(
    Conv2D(32, (3, 3),
           input_shape=data.shape[1:],
           kernel_initializer=initializer,
           padding="same"))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
# The first CNN layer followed by Relu and MaxPooling layers
model.add(Conv2D(64, (3, 3), kernel_initializer=initializer, padding="same"))
model.add(Activation('relu'))
def __init__(self,
             name=None,
             d_model=512,
             d_proj=64,
             n_heads=8,
             use_bias=False,
             qkvw_init_scale=[1, 1, 1, 1],
             apply_softmax=True,
             sparse_pattern=None,
             sparse_block_size=16,
             trainable=True,
             relative_attention_type=False,
             shape_2d=None,
             share_pe_heads=True,
             pe_initialization='uniform',
             cls_token=True,
             q_initializer='uniform',
             k_initializer='uniform',
             v_initializer='uniform',
             o_initializer='uniform'):
    """Multi-head attention with optional block-sparse attention and
    per-projection (q/k/v/o) initializer and init-scale control.

    ``{q,k,v,o}_initializer`` accept 'uniform' (Glorot), 'zeros' or
    'identity'; ``qkvw_init_scale`` scales each projection's initializer
    via InitializerScaler in q, k, v, o order.
    """
    super(MultiheadAttention, self).__init__(name=name)
    self.d_model = d_model
    self.d_proj = d_proj
    self.n_heads = n_heads
    self.sparse_pattern = sparse_pattern
    if sparse_pattern is None:
        self.scaled_dot_product_attention = ScaledDotProductAttention(
            name=self.name + '_attention',
            apply_softmax=apply_softmax,
            relative_attention_type=relative_attention_type,
            shape_2d=shape_2d,
            share_pe_heads=share_pe_heads,
            pe_initialization=pe_initialization,
            cls_token=cls_token)
    else:
        self.scaled_dot_product_attention = BlockSparseProductAttention(
            name=self.name + '_block_sparse_attention',
            block_size=sparse_block_size,
            sparse_pattern=sparse_pattern,
            apply_softmax=apply_softmax)

    def _resolve_initializer(spec):
        # Map a string spec to a concrete keras initializer; any other
        # value (e.g. a pre-built initializer) is passed through unchanged,
        # matching the previous if/elif behavior.
        if spec == 'uniform':
            return tfki.GlorotUniform()
        if spec == 'zeros':
            return tfki.Zeros()
        if spec == 'identity':
            return tfki.Identity()
        return spec

    # DRY: the four identical if/elif chains collapsed into one helper.
    q_initializer = _resolve_initializer(q_initializer)
    k_initializer = _resolve_initializer(k_initializer)
    v_initializer = _resolve_initializer(v_initializer)
    o_initializer = _resolve_initializer(o_initializer)

    self.wq = tf.keras.layers.Dense(d_proj * n_heads,
                                    use_bias=use_bias,
                                    name=self.name + '_wq',
                                    kernel_initializer=InitializerScaler(
                                        q_initializer, qkvw_init_scale[0]),
                                    trainable=trainable)
    self.wk = tf.keras.layers.Dense(d_proj * n_heads,
                                    use_bias=use_bias,
                                    name=self.name + '_wk',
                                    kernel_initializer=InitializerScaler(
                                        k_initializer, qkvw_init_scale[1]),
                                    trainable=trainable)
    self.wv = tf.keras.layers.Dense(d_proj * n_heads,
                                    use_bias=use_bias,
                                    name=self.name + '_wv',
                                    kernel_initializer=InitializerScaler(
                                        v_initializer, qkvw_init_scale[2]),
                                    trainable=trainable)
    self.wo = tf.keras.layers.Dense(d_model,
                                    use_bias=use_bias,
                                    name=self.name + '_wo',
                                    kernel_initializer=InitializerScaler(
                                        o_initializer, qkvw_init_scale[3]),
                                    trainable=trainable)
def __init__(self, p_fun, loss_weights=(0.5, 0.5),
             n_features=1, n_labels=1, hidden_layers=None,
             metric='mae', initializer=None, optimizer=None,
             learning_rate=0.01, history=None,
             kernel_reg_rate=0.0, kernel_reg_power=1,
             bias_reg_rate=0.0, bias_reg_power=1,
             feature_names=None, output_names=None):
    """
    Parameters
    ----------
    p_fun : function
        Physics function to guide the neural network loss function.
        This function must take (y_predicted, y_true, p, **p_kwargs)
        as arguments with datatypes (tf.Tensor, np.ndarray, np.ndarray).
        The function must return a tf.Tensor object with a single numeric
        loss value (output.ndim == 0).
    loss_weights : tuple, optional
        Loss weights for the neural network y_predicted vs. y_true and
        for the p_fun loss, respectively. For example,
        loss_weights=(0.0, 1.0) would simplify the phygnn loss function
        to just the p_fun output.
    n_features : int, optional
        Number of input features.
    n_labels : int, optional
        Number of output labels.
    hidden_layers : list, optional
        List of dictionaries of key word arguments for each hidden
        layer in the NN. Dense linear layers can be input with their
        activations or separately for more explicit control over the
        layer ordering. For example, this is a valid input for
        hidden_layers that will yield 7 hidden layers (9 layers total):
            [{'units': 64, 'activation': 'relu', 'dropout': 0.01},
             {'units': 64},
             {'batch_normalization': {'axis': -1}},
             {'activation': 'relu'},
             {'dropout': 0.01}]
    metric : str, optional
        Loss metric option for the NN loss function (not the physical
        loss function). Must be a valid key in phygnn.loss_metrics.METRICS
    initializer : tensorflow.keras.initializers, optional
        Instantiated initializer object. None defaults to GlorotUniform
    optimizer : tensorflow.keras.optimizers, optional
        Instantiated neural network optimization object.
        None defaults to Adam.
    learning_rate : float, optional
        Optimizer learning rate.
    history : None | pd.DataFrame, optional
        Learning history if continuing a training session.
    kernel_reg_rate : float, optional
        Kernel regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer weights and should reduce
        model complexity. Setting this to 0.0 will disable kernel
        regularization.
    kernel_reg_power : int, optional
        Kernel regularization power. kernel_reg_power=1 is L1
        regularization (lasso regression), and kernel_reg_power=2 is L2
        regularization (ridge regression).
    bias_reg_rate : float, optional
        Bias regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer biases and should reduce
        model complexity. Setting this to 0.0 will disable bias
        regularization.
    bias_reg_power : int, optional
        Bias regularization power. bias_reg_power=1 is L1 regularization
        (lasso regression), and bias_reg_power=2 is L2 regularization
        (ridge regression).
    feature_names : list | tuple | None, optional
        Training feature names (strings). Mostly a convenience so that a
        loaded-from-disk model will have declared feature names, making it
        easier to feed in features for prediction. This will also get set
        if phygnn is trained on a DataFrame.
    output_names : list | tuple | None, optional
        Prediction output names (strings). Mostly a convenience so that a
        loaded-from-disk model will have declared output names, making it
        easier to understand prediction output. This will also get set
        if phygnn is trained on a DataFrame.
    """
    self._p_fun = p_fun
    self._loss_weights = None  # set via set_loss_weights() below
    self._metric = metric
    self._input_dims = n_features
    self._output_dims = n_labels
    self._layers = Layers(n_features, n_labels=n_labels,
                          hidden_layers=hidden_layers)
    self._optimizer = None
    self._history = history
    self._learning_rate = learning_rate
    self.kernel_reg_rate = kernel_reg_rate
    self.kernel_reg_power = kernel_reg_power
    self.bias_reg_rate = bias_reg_rate
    self.bias_reg_power = bias_reg_power
    self.feature_names = feature_names
    self.output_names = output_names

    self.set_loss_weights(loss_weights)

    # Validate the metric against the known METRICS registry.
    if self._metric.lower() not in METRICS:
        e = ('Could not recognize error metric "{}". The following error '
             'metrics are available: {}'.format(self._metric,
                                                list(METRICS.keys())))
        logger.error(e)
        raise KeyError(e)
    else:
        self._metric_fun = METRICS[self._metric.lower()]

    # Defaults: GlorotUniform initializer, Adam optimizer.
    self._initializer = initializer
    if initializer is None:
        self._initializer = initializers.GlorotUniform()

    self._optimizer = optimizer
    if optimizer is None:
        self._optimizer = optimizers.Adam(learning_rate=learning_rate)
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow.keras.initializers as inits
import numpy as np
from tensorflow.keras.regularizers import l2

# this conserves std for layers with relu activation
initialize_relu = inits.VarianceScaling(scale=1. / 3.,
                                        mode="fan_in",
                                        distribution="uniform")
# This is the standard tf.keras.layers.Dense initializer, it conserves std
# for layers with tanh activation
initialize_tanh = inits.GlorotUniform()


class Actor(tf.keras.Model):
    """Deterministic policy network: two ReLU hidden layers and a tanh
    output scaled by the environment's max action."""

    def __init__(self, state_dim, action_dim, max_action, ac_layers,
                 reg_coeff):
        super(Actor, self).__init__()
        self.l1 = kl.Dense(ac_layers[0],
                           activation='relu',
                           kernel_initializer=initialize_relu,
                           kernel_regularizer=l2(reg_coeff))
        self.l2 = kl.Dense(ac_layers[1],
                           activation='relu',
                           kernel_initializer=initialize_relu,
                           kernel_regularizer=l2(reg_coeff))
        self.l3 = kl.Dense(action_dim,
                           activation='tanh',
                           kernel_initializer=initialize_tanh,
                           kernel_regularizer=l2(reg_coeff))
        self._max_action = max_action
        # Remember building your model before you can copy it
        # else the weights wont be there. Could also call the model once in
        # the beginning to build it implicitly
        self.build(input_shape=(None, state_dim))

    #@tf.function
    def call(self, state):
        # Inputs must already be float32 (no implicit casting here).
        assert state.dtype == tf.float32
        x = self.l1(state)
        x = self.l2(x)
        return self._max_action * self.l3(x)
def __init__(self, p_fun, loss_weights=(0.5, 0.5),
             n_features=1, n_labels=1, hidden_layers=None,
             input_layer=None, output_layer=None, layers_obj=None,
             metric='mae', initializer=None, optimizer=None,
             learning_rate=0.01, history=None,
             kernel_reg_rate=0.0, kernel_reg_power=1,
             bias_reg_rate=0.0, bias_reg_power=1,
             feature_names=None, output_names=None, name=None,
             version_record=None):
    """
    Parameters
    ----------
    p_fun : function
        Physics function to guide the neural network loss function.
        This fun must take (phygnn, y_true, y_predicted, p, **p_kwargs)
        as arguments with datatypes (PhysicsGuidedNeuralNetwork,
        tf.Tensor, np.ndarray, np.ndarray). The function must return a
        tf.Tensor object with a single numeric loss value
        (output.ndim == 0).
    loss_weights : tuple, optional
        Loss weights for the neural network y_true vs. y_predicted and
        for the p_fun loss, respectively. For example,
        loss_weights=(0.0, 1.0) would simplify the phygnn loss function
        to just the p_fun output.
    n_features : int, optional
        Number of input features. This should match the last dimension
        of the feature training data.
    n_labels : int, optional
        Number of output labels. This should match the last dimension
        of the label training data.
    hidden_layers : list, optional
        List of dictionaries of key word arguments for each hidden
        layer in the NN. Dense linear layers can be input with their
        activations or separately for more explicit control over the
        layer ordering. For example, this is a valid input for
        hidden_layers that will yield 8 hidden layers (10 layers
        including input+output):
            [{'units': 64, 'activation': 'relu', 'dropout': 0.01},
             {'units': 64},
             {'batch_normalization': {'axis': -1}},
             {'activation': 'relu'},
             {'dropout': 0.01},
             {'class': 'Flatten'},
             ]
    input_layer : None | bool | dict
        Input layer. specification. Can be a dictionary similar to
        hidden_layers specifying a dense / conv / lstm layer. Will
        default to a keras InputLayer with input shape = n_features.
        Can be False if the input layer will be included in the
        hidden_layers input.
    output_layer : None | bool | list | dict
        Output layer specification. Can be a list/dict similar to
        hidden_layers input specifying a dense layer with activation.
        For example, for a classfication problem with a single output,
        output_layer should be [{'units': 1}, {'activation': 'sigmoid'}].
        This defaults to a single dense layer with no activation
        (best for regression problems). Can be False if the output layer
        will be included in the hidden_layers input.
    layers_obj : None | phygnn.utilities.tf_layers.Layers
        Optional initialized Layers object to set as the model layers
        including pre-set weights. This option will override the
        hidden_layers, input_layer, and output_layer arguments.
    metric : str, optional
        Loss metric option for the NN loss function (not the physical
        loss function). Must be a valid key in phygnn.loss_metrics.METRICS
        or a method in tensorflow.keras.losses that takes
        (y_true, y_predicted) as arguments.
    initializer : tensorflow.keras.initializers, optional
        Instantiated initializer object. None defaults to GlorotUniform
    optimizer : tensorflow.keras.optimizers | dict | None
        Instantiated tf.keras.optimizers object or a dict optimizer config
        from tf.keras.optimizers.get_config(). None defaults to Adam.
    learning_rate : float, optional
        Optimizer learning rate. Not used if optimizer input arg is a
        pre-initialized object or if optimizer input arg is a config dict.
    history : None | pd.DataFrame, optional
        Learning history if continuing a training session.
    kernel_reg_rate : float, optional
        Kernel regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer weights and should reduce
        model complexity. Setting this to 0.0 will disable kernel
        regularization.
    kernel_reg_power : int, optional
        Kernel regularization power. kernel_reg_power=1 is L1
        regularization (lasso regression), and kernel_reg_power=2 is L2
        regularization (ridge regression).
    bias_reg_rate : float, optional
        Bias regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer biases and should reduce
        model complexity. Setting this to 0.0 will disable bias
        regularization.
    bias_reg_power : int, optional
        Bias regularization power. bias_reg_power=1 is L1 regularization
        (lasso regression), and bias_reg_power=2 is L2 regularization
        (ridge regression).
    feature_names : list | tuple | None, optional
        Training feature names (strings). Mostly a convenience so that a
        loaded-from-disk model will have declared feature names, making it
        easier to feed in features for prediction. This will also get set
        if phygnn is trained on a DataFrame.
    output_names : list | tuple | None, optional
        Prediction output names (strings). Mostly a convenience so that a
        loaded-from-disk model will have declared output names, making it
        easier to understand prediction output. This will also get set
        if phygnn is trained on a DataFrame.
    name : None | str
        Optional model name for debugging.
    version_record : dict | None
        Optional record of import package versions. None (default) will
        save active environment versions. A dictionary will be interpreted
        as versions from a loaded model and will be saved as an attribute.
    """
    super().__init__(n_features=n_features, n_labels=n_labels,
                     hidden_layers=hidden_layers,
                     input_layer=input_layer,
                     output_layer=output_layer,
                     layers_obj=layers_obj,
                     feature_names=feature_names,
                     output_names=output_names,
                     version_record=version_record)

    # Fall back to a no-op physics function when none is supplied.
    self._p_fun = p_fun if p_fun is not None else self.p_fun_dummy
    self._loss_weights = None  # set via set_loss_weights() below
    self._metric = metric
    self._optimizer = None
    self._history = history
    self._learning_rate = learning_rate
    self.kernel_reg_rate = kernel_reg_rate
    self.kernel_reg_power = kernel_reg_power
    self.bias_reg_rate = bias_reg_rate
    self.bias_reg_power = bias_reg_power
    self.name = name if isinstance(name, str) else 'phygnn'

    self.set_loss_weights(loss_weights)

    # Resolve the metric: first the METRICS registry, then fall back to a
    # same-named tf.keras.losses callable.
    if self._metric.lower() in METRICS:
        self._metric_fun = METRICS[self._metric.lower()]
    else:
        try:
            self._metric_fun = getattr(tf.keras.losses, self._metric)
        except Exception as e:
            msg = ('Could not recognize error metric "{}". The following '
                   'error metrics are available: {}'.format(
                       self._metric, list(METRICS.keys())))
            logger.error(msg)
            raise KeyError(msg) from e

    # Defaults: GlorotUniform initializer, Adam optimizer.
    self._initializer = initializer
    if initializer is None:
        self._initializer = initializers.GlorotUniform()

    # Optimizer may be a pre-built object, a get_config() dict, or None.
    self._optimizer = optimizer
    if isinstance(optimizer, dict):
        class_name = optimizer['name']
        OptimizerClass = getattr(optimizers, class_name)
        self._optimizer = OptimizerClass.from_config(optimizer)
    elif optimizer is None:
        self._optimizer = optimizers.Adam(learning_rate=learning_rate)
class WindowAttention(layers.Layer):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        attn_drop_ratio (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop_ratio (float, optional): Dropout ratio of output. Default: 0.0
    """

    # Shared default initializers for the dense projections.
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.Zeros()

    def __init__(self,
                 dim,
                 window_size,
                 num_heads=8,
                 qkv_bias=False,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 name=None):
        super(WindowAttention, self).__init__(name=name)
        self.dim = dim
        self.window_size = window_size  # [Mh, Mw]
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5  # 1/sqrt(d) attention scaling
        self.qkv = layers.Dense(dim * 3,
                                use_bias=qkv_bias,
                                name="qkv",
                                kernel_initializer=self.k_ini,
                                bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim,
                                 name="proj",
                                 kernel_initializer=self.k_ini,
                                 bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def build(self, input_shape):
        # define a parameter table of relative position bias
        # [2*Mh-1 * 2*Mw-1, nH]
        self.relative_position_bias_table = self.add_weight(
            shape=[
                (2 * self.window_size[0] - 1) *
                (2 * self.window_size[1] - 1), self.num_heads
            ],
            initializer=initializers.TruncatedNormal(stddev=0.02),
            trainable=True,
            dtype=tf.float32,
            name="relative_position_bias_table")

        # Precompute a pairwise relative-position index for every token
        # pair inside a window; used to look up the bias table in call().
        coords_h = np.arange(self.window_size[0])
        coords_w = np.arange(self.window_size[1])
        coords = np.stack(np.meshgrid(coords_h, coords_w,
                                      indexing="ij"))  # [2, Mh, Mw]
        coords_flatten = np.reshape(coords, [2, -1])  # [2, Mh*Mw]
        # [2, Mh*Mw, 1] - [2, 1, Mh*Mw]
        relative_coords = coords_flatten[:, :, None] - \
            coords_flatten[:, None, :]  # [2, Mh*Mw, Mh*Mw]
        relative_coords = np.transpose(relative_coords,
                                       [1, 2, 0])  # [Mh*Mw, Mh*Mw, 2]
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # [Mh*Mw, Mh*Mw]

        self.relative_position_index = tf.Variable(
            tf.convert_to_tensor(relative_position_index),
            trainable=False,
            dtype=tf.int64,
            name="relative_position_index")

    def call(self, x, mask=None, training=None):
        """
        Args:
            x: input features with shape of (num_windows*B, Mh*Mw, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
            training: whether training mode
        """
        # [batch_size*num_windows, Mh*Mw, total_embed_dim]
        B_, N, C = x.shape

        # qkv(): -> [batch_size*num_windows, Mh*Mw, 3 * total_embed_dim]
        qkv = self.qkv(x)
        # reshape: -> [batch_size*num_windows, Mh*Mw, 3, num_heads, embed_dim_per_head]
        qkv = tf.reshape(qkv, [B_, N, 3, self.num_heads, C // self.num_heads])
        # transpose: -> [3, batch_size*num_windows, num_heads, Mh*Mw, embed_dim_per_head]
        qkv = tf.transpose(qkv, [2, 0, 3, 1, 4])
        # [batch_size*num_windows, num_heads, Mh*Mw, embed_dim_per_head]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # transpose: -> [batch_size*num_windows, num_heads, embed_dim_per_head, Mh*Mw]
        # multiply -> [batch_size*num_windows, num_heads, Mh*Mw, Mh*Mw]
        attn = tf.matmul(a=q, b=k, transpose_b=True) * self.scale

        # relative_position_bias(reshape): [Mh*Mw*Mh*Mw,nH] -> [Mh*Mw,Mh*Mw,nH]
        relative_position_bias = tf.gather(
            self.relative_position_bias_table,
            tf.reshape(self.relative_position_index, [-1]))
        relative_position_bias = tf.reshape(relative_position_bias, [
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1], -1
        ])
        relative_position_bias = tf.transpose(relative_position_bias,
                                              [2, 0, 1])  # [nH, Mh*Mw, Mh*Mw]
        attn = attn + tf.expand_dims(relative_position_bias, 0)

        if mask is not None:
            # mask: [nW, Mh*Mw, Mh*Mw]
            nW = mask.shape[0]  # num_windows
            # attn(reshape): [batch_size, num_windows, num_heads, Mh*Mw, Mh*Mw]
            # mask(expand_dim): [1, nW, 1, Mh*Mw, Mh*Mw]
            attn = tf.reshape(
                attn, [B_ // nW, nW, self.num_heads, N, N]) + tf.expand_dims(
                    tf.expand_dims(mask, 1), 0)
            attn = tf.reshape(attn, [-1, self.num_heads, N, N])

        attn = tf.nn.softmax(attn, axis=-1)
        attn = self.attn_drop(attn, training=training)

        # multiply -> [batch_size*num_windows, num_heads, Mh*Mw, embed_dim_per_head]
        x = tf.matmul(attn, v)
        # transpose: -> [batch_size*num_windows, Mh*Mw, num_heads, embed_dim_per_head]
        x = tf.transpose(x, [0, 2, 1, 3])
        # reshape: -> [batch_size*num_windows, Mh*Mw, total_embed_dim]
        x = tf.reshape(x, [B_, N, C])

        x = self.proj(x)
        x = self.proj_drop(x, training=training)
        return x
def __init__(self, return_attention=False, **kwargs):
    """Attention-weighted average layer; optionally also returns the
    attention weights when ``return_attention`` is True."""
    # Fixed seed keeps the weight initialization reproducible across runs.
    self.init = initializers.GlorotUniform(seed=101)
    self.return_attention = return_attention
    self.supports_masking = True
    super(AttentionWeightedAverage, self).__init__(**kwargs)