def init_wrap(init: dy.PyInitializer, size: Tuple[int]) -> dy.PyInitializer:
    if init == OrthogonalInitializer:
        return dy.NumpyInitializer(init.init(size))
    elif isinstance(init, dy.PyInitializer):
        return init
    else:
        raise RuntimeError('%s is not an instance of dy.PyInitializer.' % init)
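# A minimal usage sketch for init_wrap (hedged): `model` and the (128, 64)
# shape are hypothetical. Passing the OrthogonalInitializer class takes the
# first branch (a NumPy-backed orthogonal init); any dy.PyInitializer
# instance, as below, is passed through unchanged.
model = dy.ParameterCollection()
W = model.add_parameters((128, 64),
                         init=init_wrap(dy.GlorotInitializer(), (128, 64)))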
def __init__(self, model, input_dim, output_dim):
    self.input = input_dim
    self.output = output_dim
    Saxe_initializer = Saxe.Orthogonal()
    self.W = model.add_parameters(
        (self.output, self.input),
        init=dy.NumpyInitializer(Saxe_initializer((self.output, self.input))))
    self.b = model.add_parameters((self.output,), init=dy.ConstInitializer(0))
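# A hedged forward-pass sketch for the linear layer above; `layer` and `x`
# (a dy.Expression of size layer.input) are hypothetical names.
def linear_forward(layer, x):
    W = dy.parameter(layer.W)
    b = dy.parameter(layer.b)
    return dy.affine_transform([b, W, x])  # computes W * x + b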
def initializer(self,
                dim: Tuple[numbers.Integral],
                is_lookup: bool = False,
                num_shared: numbers.Integral = 1) -> 'dy.NumpyInitializer':
    if dim != self.array.shape:
        raise ValueError(
            f"the passed initializer array has different dimensions than the "
            f"parameters to be initialized: {self.array.shape} != {dim}")
    return dy.NumpyInitializer(array=self.array)
def __init__(self, model, n_labels, src_ctx_dim=400, hidden=400, dropout=0.33):
    self.src_ctx_dim = src_ctx_dim
    self.dropout = dropout
    self.n_labels = n_labels
    self.hidden = hidden
    self.dist_max = 10
    self.dist_dims = 32
    self.dlookup = model.add_lookup_parameters(
        (self.dist_max * 2, self.dist_dims), init=dy.ConstInitializer(0))
    Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)
    self.W_head = model.add_parameters(
        (self.src_ctx_dim, self.src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((self.src_ctx_dim, self.src_ctx_dim))))
    self.b_head = model.add_parameters((self.src_ctx_dim,),
                                       init=dy.ConstInitializer(0))
    self.W_mod = model.add_parameters(
        (self.src_ctx_dim, self.src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((self.src_ctx_dim, self.src_ctx_dim))))
    self.b_mod = model.add_parameters((self.src_ctx_dim,),
                                      init=dy.ConstInitializer(0))
    self.W_arc1 = model.add_parameters(
        (self.hidden, self.src_ctx_dim + self.dist_dims),
        init=dy.NumpyInitializer(
            Saxe_initializer((self.hidden,
                              self.src_ctx_dim + self.dist_dims))))
    self.b_arc1 = model.add_parameters((self.hidden,),
                                       init=dy.ConstInitializer(0))
    self.W_arc2 = model.add_parameters(
        (self.n_labels, self.hidden),
        init=dy.NumpyInitializer(
            Saxe_initializer((self.n_labels, self.hidden))))
    self.b_arc2 = model.add_parameters((self.n_labels,),
                                       init=dy.ConstInitializer(0))
def __init__(self, model, num_tasks, hidden_dim, num_subspaces=1,
             init_scheme=BALANCED):
    """
    Initializes a CrossStitchLayer.
    :param model: the DyNet Model
    :param num_tasks: the number of tasks
    :param hidden_dim: the # of hidden dimensions of the previous LSTM layer
    :param num_subspaces: the number of subspaces
    :param init_scheme: the initialization scheme; balanced or imbalanced
    """
    print('Using %d subspaces...' % num_subspaces, flush=True)
    alpha_params = np.full(
        (num_tasks * num_subspaces, num_tasks * num_subspaces),
        1. / (num_tasks * num_subspaces))
    if init_scheme == IMBALANCED:
        if num_subspaces == 1:
            alpha_params = np.full((num_tasks, num_tasks),
                                   0.1 / (num_tasks - 1))
            for i in range(num_tasks):
                alpha_params[i, i] = 0.9
        else:
            # 0 1 0 1
            # 0 1 0 1
            # 1 0 1 0
            # 1 0 1 0
            for (x, y), value in np.ndenumerate(alpha_params):
                if (y + 1) % num_subspaces == 0 and not \
                        (x in range(num_tasks, num_tasks + num_subspaces)):
                    alpha_params[x, y] = 0.95
                elif (y + num_subspaces) % num_subspaces == 0 and x \
                        in range(num_tasks, num_tasks + num_subspaces):
                    alpha_params[x, y] = 0.95
                else:
                    alpha_params[x, y] = 0.05
    self.alphas = model.add_parameters(
        (num_tasks * num_subspaces, num_tasks * num_subspaces),
        init=dynet.NumpyInitializer(alpha_params))
    # print('Initializing cross-stitch units to:', flush=True)
    # print(dynet.parameter(self.alphas).value(), flush=True)
    self.num_tasks = num_tasks
    self.num_subspaces = num_subspaces
    self.hidden_dim = hidden_dim
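# A hedged sketch of applying the cross-stitch units for the simple
# num_subspaces == 1 case: the per-task hidden states are stacked as columns
# and mixed linearly through the alpha matrix. `layer` and `task_states` are
# hypothetical names, not part of the snippet above.
def cross_stitch_forward(layer, task_states):
    # task_states: list of num_tasks expressions, each of size hidden_dim
    assert layer.num_subspaces == 1
    stacked = dynet.concatenate_cols(task_states)  # hidden_dim x num_tasks
    alphas = dynet.parameter(layer.alphas)         # num_tasks x num_tasks
    # column i of the result is sum_j alphas[i, j] * task_states[j]
    mixed = stacked * dynet.transpose(alphas)
    return [dynet.pick(mixed, i, dim=1) for i in range(layer.num_tasks)]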
def __init__(self, model, num_layers, hidden_dim, init_scheme=IMBALANCED):
    """
    Initializes a LayerStitchLayer.
    :param model: the DyNet model
    :param num_layers: the number of layers
    :param hidden_dim: the hidden dimensions of the LSTM layers
    :param init_scheme: the initialisation scheme; balanced or imbalanced
    """
    if init_scheme == IMBALANCED:
        beta_params = np.full((num_layers,), 0.1 / (num_layers - 1))
        beta_params[-1] = 0.9
    elif init_scheme == BALANCED:
        beta_params = np.full((num_layers,), 1. / num_layers)
    else:
        raise ValueError('Invalid initialization scheme for layer-stitch '
                         'units: %s.' % init_scheme)
    self.betas = model.add_parameters(
        num_layers, init=dynet.NumpyInitializer(beta_params))
    print('Initializing layer-stitch units to:', flush=True)
    print(dynet.parameter(self.betas).value(), flush=True)
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
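# A hedged sketch of combining per-layer LSTM outputs with the layer-stitch
# weights: a weighted sum over layers. `layer` and `layer_outputs` are
# hypothetical names.
def layer_stitch_forward(layer, layer_outputs):
    # layer_outputs: list of num_layers expressions, each of size hidden_dim
    betas = dynet.parameter(layer.betas)  # num_layers mixing weights
    return dynet.concatenate_cols(layer_outputs) * betas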
def __init__(self, model, input_size, recur_size, forget_bias=0.0):
    self.input_size = input_size
    self.recur_size = recur_size
    self.input_drop_mask = dy.ones(self.input_size)
    self.recur_drop_mask = dy.ones(self.recur_size)
    self.forget_bias = forget_bias
    self.cell_previous = None
    self.hidden_previous = None
    self.init = False
    self.input_drop = 0
    self.recur_drop = 0
    # One orthogonal block for the stacked gate matrix, tiled four times
    # (one copy per gate).
    Saxe_initializer = Saxe.Orthogonal()
    gates_init = Saxe_initializer(
        (self.recur_size, self.input_size + self.recur_size))
    gates_init = np.concatenate([gates_init] * 4)
    self.WXH = model.add_parameters(
        (self.recur_size * 4, self.input_size + self.recur_size),
        init=dy.NumpyInitializer(gates_init))
    self.b = model.add_parameters((self.recur_size * 4,),
                                  init=dy.ConstInitializer(0))
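# A hedged sketch of a single recurrence step for the cell above: all four
# gates come from one matrix-vector product against the stacked WXH, then
# the result is split. The (input, forget, output, candidate) gate order is
# an assumption, not something the snippet above pins down.
def lstm_step(cell, x, h_prev, c_prev):
    gates = dy.affine_transform([dy.parameter(cell.b),
                                 dy.parameter(cell.WXH),
                                 dy.concatenate([x, h_prev])])
    n = cell.recur_size
    i = dy.logistic(dy.pick_range(gates, 0, n))
    f = dy.logistic(dy.pick_range(gates, n, 2 * n) + cell.forget_bias)
    o = dy.logistic(dy.pick_range(gates, 2 * n, 3 * n))
    g = dy.tanh(dy.pick_range(gates, 3 * n, 4 * n))
    c = dy.cmult(f, c_prev) + dy.cmult(i, g)
    return dy.cmult(o, dy.tanh(c)), c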
def build(self,
          char_dim,
          char_lstm_dim,
          ch_b,
          mt_d,
          word_dim,
          word_lstm_dim,
          w_b,
          lr_method,
          pre_emb,
          cap_dim,
          training=True,
          **kwargs):
    """
    Build the network.
    """
    self.training = training

    def _create_get_representation(activation_function=lambda x: x):
        """
        Helper function to create a function which assembles a
        representation given an activation_function.
        """
        def f(obj, es):
            representations = []
            # for e in es:
            #     dynet.ensure_freshness(e)
            for (fb, bb) in obj.builder_layers:
                fs = fb.initial_state().transduce(es)
                bs = bb.initial_state().transduce(reversed(es))
                es = [dynet.concatenate([f, b])
                      for f, b in zip(fs, reversed(bs))]
                representations.append(
                    activation_function(dynet.concatenate([fs[-1], bs[-1]])))
            return representations
        return f

    BiRNNBuilder.get_representation = _create_get_representation(
        activation_function=dynet.rectify)
    BiRNNBuilder.get_representation_concat = _create_get_representation()

    # Training parameters
    n_words = len(self.id_to_word)
    n_chars = len(self.id_to_char)
    n_tags = len(self.id_to_tag)
    n_morpho_tags = len(self.id_to_morpho_tag)

    # Number of capitalization features
    if cap_dim:
        n_cap = 17

    # Final input (all word features)
    word_representation_dim = 0

    def get_scale(shape):
        return np.sqrt(6 / np.sum(list(shape)))

    #
    # Word inputs
    #
    if word_dim:
        # Initialize with pretrained embeddings
        scale = get_scale((n_words, word_dim))
        new_weights = scale * np.random.uniform(-1.0, 1.0,
                                                (n_words, word_dim))
        if pre_emb and training:
            print('Loading pretrained embeddings from %s...' % pre_emb)
            pretrained = {}
            emb_invalid = 0
            for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                line = line.split()
                if len(line) == word_dim + 1:
                    pretrained[line[0]] = np.array(
                        [float(x) for x in line[1:]]).astype(np.float32)
                else:
                    emb_invalid += 1
            if emb_invalid > 0:
                print('WARNING: %i invalid lines' % emb_invalid)
            c_found = 0
            c_lower = 0
            c_zeros = 0
            # Lookup table initialization
            for i in range(n_words):
                word = self.id_to_word[i]
                if word in pretrained:
                    new_weights[i] = pretrained[word]
                    c_found += 1
                elif word.lower() in pretrained:
                    new_weights[i] = pretrained[word.lower()]
                    c_lower += 1
                elif re.sub(r'\d', '0', word.lower()) in pretrained:
                    new_weights[i] = pretrained[
                        re.sub(r'\d', '0', word.lower())]
                    c_zeros += 1
            print('Loaded %i pretrained embeddings.' % len(pretrained))
            print(('%i / %i (%.4f%%) words have been initialized with '
                   'pretrained embeddings.') %
                  (c_found + c_lower + c_zeros, n_words,
                   100. * (c_found + c_lower + c_zeros) / n_words))
            print(('%i found directly, %i after lowercasing, '
                   '%i after lowercasing + zero.') %
                  (c_found, c_lower, c_zeros))
        word_representation_dim += word_dim
        self.word_embeddings = self.model.add_lookup_parameters(
            (n_words, word_dim),
            init=dynet.NumpyInitializer(new_weights),
            name="wordembeddings")

    def create_bilstm_layer(label, input_dim, lstm_dim, bilstm=True):
        if bilstm:
            builder = BiRNNBuilder(1, input_dim, lstm_dim, self.model,
                                   CoupledLSTMBuilder)
        else:
            builder = CoupledLSTMBuilder(1, input_dim, lstm_dim, self.model)
        return builder

    #
    # Char inputs
    #
    if char_dim:
        self.char_embeddings = self.model.add_lookup_parameters(
            (n_chars, char_dim), name="charembeddings")
        self.char_lstm_layer = create_bilstm_layer(
            "char",
            char_dim,
            (2 if ch_b else 1) * char_lstm_dim,
            bilstm=True if ch_b else False)
        word_representation_dim += (2 if ch_b else 1) * char_lstm_dim

    # if self.parameters['integration_mode'] in [1, 2] or \
    #         self.parameters['active_models'] in [1, 2, 3]:
    if self.parameters['active_models'] in [1, 2, 3]:
        self.char_lstm_layer_for_morph_analysis_roots = \
            create_bilstm_layer("char_for_morph_analysis_root",
                                char_dim, 2 * mt_d, bilstm=True)
        self.morpho_tag_embeddings = self.model.add_lookup_parameters(
            (n_morpho_tags, mt_d), name="charembeddings")
        self.morpho_tag_lstm_layer_for_morph_analysis_tags = \
            create_bilstm_layer("morpho_tag_for_morph_analysis_tags",
                                mt_d, 2 * mt_d, bilstm=True)

    if self.parameters['use_golden_morpho_analysis_in_word_representation']:
        assert self.parameters['integration_mode'] == 0 and \
            self.parameters['active_models'] == 0, \
            "This feature is only meaningful if we solely aim at the NER task."
        self.morpho_tag_embeddings = self.model.add_lookup_parameters(
            (n_morpho_tags, mt_d), name="charembeddings")
        self.old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes = \
            create_bilstm_layer(
                "old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes",
                mt_d, 2 * mt_d, bilstm=True)
        word_representation_dim += 2 * mt_d

    #
    # Capitalization feature
    #
    if cap_dim:
        word_representation_dim += cap_dim
        self.cap_embeddings = self.model.add_lookup_parameters(
            (n_cap, cap_dim), name="capembeddings")

    if self.parameters['multilayer'] and \
            self.parameters['shortcut_connections']:
        shortcut_connection_addition = word_representation_dim
        self.sentence_level_bilstm_contexts_length = \
            shortcut_connection_addition + 2 * word_lstm_dim
    else:
        self.sentence_level_bilstm_contexts_length = 2 * word_lstm_dim
        # self.sentence_level_bilstm_contexts_length = word_lstm_dim

    # TODO: Q: as the output of self.tanh_layer_W will be used, right?
    self.tanh_layer_W = self.model.add_parameters(
        (word_lstm_dim, self.sentence_level_bilstm_contexts_length))
    self.tanh_layer_b = self.model.add_parameters((word_lstm_dim,))

    if self.parameters['integration_mode'] in [0, 1]:
        self.last_layer_W = self.model.add_parameters(
            (n_tags, word_lstm_dim))
    elif self.parameters['integration_mode'] == 2:
        self.last_layer_W = self.model.add_parameters(
            (n_tags, word_lstm_dim + 2 * mt_d))
    self.last_layer_b = self.model.add_parameters((n_tags,))

    self.transform_context_layer_b = self.model.add_parameters((2 * mt_d,))
    self.transform_context_layer_W = self.model.add_parameters(
        (2 * mt_d, self.sentence_level_bilstm_contexts_length))

    # LSTM for words
    # self.sentence_level_bilstm_layer = \
    #     create_bilstm_layer("sentence_level",
    #                         word_representation_dim,
    #                         2 * word_lstm_dim,
    #                         bilstm=True if w_b else False)
    from toolkit.rnn import BiLSTMMultiLayeredWithShortcutConnections
    if self.parameters['multilayer']:
        self.num_sentence_level_bilstm_layers = 3
    else:
        self.num_sentence_level_bilstm_layers = 1
    self.sentence_level_bilstm_layer = \
        BiLSTMMultiLayeredWithShortcutConnections(
            self.num_sentence_level_bilstm_layers,
            word_representation_dim,
            2 * word_lstm_dim,
            self.model,
            CoupledLSTMBuilder,
            self.parameters['shortcut_connections'])

    def _create_tying_method(activation_function=dynet.tanh, classic=True):
        def f(x, y):
            if classic:
                return dynet.tanh(x + y)
            else:
                return activation_function(
                    self.tying_method_W * dynet.concatenate([x, y])
                    + self.tying_method_b)
        return f

    if self.parameters['tying_method']:
        self.tying_method_W = self.model.add_parameters(
            (word_lstm_dim, 2 * mt_d))
        self.tying_method_b = self.model.add_parameters((word_lstm_dim,))
        self.f_tying_method = _create_tying_method(
            activation_function=dynet.tanh, classic=False)
    else:
        self.f_tying_method = _create_tying_method(
            activation_function=dynet.tanh, classic=True)

    self.crf_module = CRF(self.model, self.id_to_tag)

    # Training
    def process_hyperparameter_definition(x):
        tokens = x.split("@")
        subtokens = tokens[0].split("_")
        if len(subtokens) > 1 and subtokens[-1] == "float":
            return ["_".join(subtokens[:-1]), float(tokens[1])]
        else:
            return tokens

    _tokens = lr_method.split("-")
    opt_update_algorithm = _tokens[0]
    opt_hyperparameters = [
        process_hyperparameter_definition(x) for x in _tokens[1:]
    ]
    opt_update_algorithms = {
        'sgd': dynet.SimpleSGDTrainer,
        'adam': dynet.AdamTrainer,
        'adadelta': dynet.AdadeltaTrainer,
        'adagrad': dynet.AdagradTrainer,
        'momentum': dynet.MomentumSGDTrainer,
        'rmsprop': dynet.RMSPropTrainer
    }
    if opt_update_algorithm == "adam":
        opt_hyperparameters += [
            ("sparse_updates_enabled",
             self.parameters['sparse_updates_enabled'])
        ]
    self.trainer = opt_update_algorithms[opt_update_algorithm](
        self.model,
        **{name: value for name, value in opt_hyperparameters})
    # self.trainer = dynet.SimpleSGDTrainer(self.model, learning_rate=0.01)
    self.saver = DynetSaver(self.model, self.model_path)
    return self
def initializer(self,
                dim,
                is_lookup: bool = False,
                num_shared: numbers.Integral = 1) -> dy.NumpyInitializer:
    return dy.NumpyInitializer(array=self.array)
def __init__(self,
             h_dim,
             h_layers,
             model_dir,
             log_dir,
             task_names,
             languages,
             embeds=None,
             activation=dynet.tanh,
             lower=False,
             noise_sigma=0.1,
             cross_stitch=False,
             num_subspaces=1,
             constraint_weight=0,
             constrain_matrices=[1, 2],
             cross_stitch_init_scheme=IMBALANCED,
             layer_stitch_init_scheme=BALANCED,
             best_train_dict={},
             best_dev_dict={},
             avg_train_score=0,
             avg_dev_score=0,
             best_epoch=-1,
             word2id={},
             oov_id=None):
    """
    :param h_dim: the hidden dimension of the model
    :param h_layers: the number of hidden layers
    :param model_dir: the directory where the model should be saved
    :param log_dir: the directory where the log should be saved
    :param task_names: the names of the tasks
    :param languages: the training languages of the model
    :param embeds: the pre-trained embeddings used by the model
    :param activation: the DyNet activation function that should be used
    :param lower: whether the words should be lower-cased
    :param noise_sigma: the stddev of the Gaussian noise that should be used
                        during training if > 0.0
    :param cross_stitch: whether to use cross-stitch units
    :param num_subspaces: the number of subspaces to use (1 or 2)
    :param constraint_weight: weight of the subspace orthogonality constraint
                              (default: 0 = no constraint)
    :param constrain_matrices: indices of LSTM weight matrices that should be
                               constrained (default: [1, 2])
    :param cross_stitch_init_scheme: initialisation scheme for cross-stitch
    :param layer_stitch_init_scheme: initialisation scheme for layer-stitch
    :param best_train_dict: dictionary storing the best scores on the
                            training set
    :param best_dev_dict: dictionary storing the best scores on the
                          development set
    :param avg_train_score: best unweighted average training score over all
                            tasks and all metrics
    :param avg_dev_score: best unweighted average development score over all
                          tasks and all metrics
    :param best_epoch: the epoch of the best performance
    :param word2id: dictionary mapping words to indices in the word embeddings
    :param oov_id: the index used for words that do not appear in the
                   pre-trained word embeddings
    """
    self.word2id = word2id
    self.task_names = task_names
    self.model_dir = model_dir
    self.log_dir = log_dir
    self.w_in_dim = 0
    if len(task_names) == 1:
        if len(languages) == 1:
            self.model_file = os.path.join(
                model_dir,
                'STSL/{}_{}.model'.format(languages[0], task_names[0]))
            self.params_file = os.path.join(
                model_dir,
                'STSL/{}_{}.pkl'.format(languages[0], task_names[0]))
        else:
            self.model_file = os.path.join(
                model_dir, 'STML/{}.model'.format(task_names[0]))
            self.params_file = os.path.join(
                model_dir, 'STML/{}.pkl'.format(task_names[0]))
    else:
        if len(languages) == 1:
            self.model_file = os.path.join(
                model_dir, 'MTSL/{}.model'.format(languages[0]))
            self.params_file = os.path.join(
                model_dir, 'MTSL/{}.pkl'.format(languages[0]))
        else:
            self.model_file = os.path.join(model_dir, 'MTML/MTML.model')
            self.params_file = os.path.join(model_dir, 'MTML/MTML.pkl')
    self.cross_stitch = cross_stitch
    self.num_subspaces = num_subspaces
    self.constraint_weight = constraint_weight
    self.constrain_matrices = constrain_matrices
    self.cross_stitch_init_scheme = cross_stitch_init_scheme
    self.layer_stitch_init_scheme = layer_stitch_init_scheme
    self.model = dynet.Model()  # init model
    # term to capture sum of constraints over all subspaces
    self.subspace_penalty = self.model.add_parameters(
        1, init=dynet.NumpyInitializer(np.zeros(1)))
    # weight of subspace constraint
    self.constraint_weight_param = self.model.add_parameters(
        1, init=dynet.NumpyInitializer(np.array(self.constraint_weight)))
    task2label2id = {}
    for task in task_names:
        labels = LABELS[task]
        task2label2id[task] = {}
        count = 0
        for label in labels:
            task2label2id[task][label] = count
            count += 1
    self.task2label2id = task2label2id  # need one dictionary per task
    self.languages = languages
    self.h_dim = h_dim
    self.activation = activation
    self.lower = lower
    self.noise_sigma = noise_sigma
    self.h_layers = h_layers
    self.predictors = {}
    self.wembeds = None  # lookup: embeddings for words
    self.embeds = embeds
    self.best_train_dict = best_train_dict
    self.best_dev_dict = best_dev_dict
    self.best_epoch = best_epoch
    self.avg_train_score = avg_train_score
    self.avg_dev_score = avg_dev_score
    self.oov_id = oov_id
def initializer(self, dim, is_lookup=False, num_shared=1):
    return dy.NumpyInitializer(array=self.array)
def __init__(self,
             in_dim,
             h_dim,
             c_in_dim,
             h_layers,
             pred_layer,
             model_dir,
             embeds_file=None,
             activation=dynet.tanh,
             lower=False,
             noise_sigma=0.1,
             task_names=[],
             cross_stitch=False,
             layer_connect=NONE,
             num_subspaces=1,
             constraint_weight=0,
             constrain_matrices=[1, 2],
             cross_stitch_init_scheme=IMBALANCED,
             layer_stitch_init_scheme=BALANCED):
    """
    :param in_dim: the dimension of the word embeddings
    :param h_dim: the hidden dimension of the model
    :param c_in_dim: the dimension of the character embeddings
    :param h_layers: the number of hidden layers
    :param pred_layer: indices indicating at which layer to predict each
                       task, e.g. [1, 2] indicates the 1st task is predicted
                       at the 1st layer, the 2nd task at the 2nd layer
    :param model_dir: the directory where the model should be saved
    :param embeds_file: the file containing pre-trained word embeddings
    :param activation: the DyNet activation that should be used
    :param lower: whether the words should be lower-cased
    :param noise_sigma: the stddev of the Gaussian noise that should be used
                        during training if > 0.0
    :param task_names: the names of the tasks
    :param cross_stitch: whether to use cross-stitch units
    :param layer_connect: the layer connections that are used (stitch, skip,
                          concat, or none)
    :param num_subspaces: the number of subspaces to use (1 or 2)
    :param constraint_weight: weight of the subspace orthogonality constraint
                              (default: 0 = no constraint)
    :param constrain_matrices: indices of LSTM weight matrices that should be
                               constrained (default: [1, 2])
    :param cross_stitch_init_scheme: initialisation scheme for cross-stitch
    :param layer_stitch_init_scheme: initialisation scheme for layer-stitch
    """
    self.word2id = {}  # word to index mapping
    self.char2id = {}  # char to index mapping
    self.task_names = task_names
    self.main_task = self.task_names[0]
    print('Using the first task as main task:', self.main_task, flush=True)
    self.model_dir = model_dir
    self.model_file = os.path.join(model_dir, MODEL_FILE)
    self.params_file = os.path.join(model_dir, PARAMS_FILE)
    self.cross_stitch = cross_stitch
    self.layer_connect = layer_connect
    self.num_subspaces = num_subspaces
    self.constraint_weight = constraint_weight
    self.constrain_matrices = constrain_matrices
    self.cross_stitch_init_scheme = cross_stitch_init_scheme
    self.layer_stitch_init_scheme = layer_stitch_init_scheme
    self.model = dynet.Model()  # init model
    # term to capture sum of constraints over all subspaces
    self.subspace_penalty = self.model.add_parameters(
        1, init=dynet.NumpyInitializer(np.zeros(1)))
    # weight of subspace constraint
    self.constraint_weight_param = self.model.add_parameters(
        1, init=dynet.NumpyInitializer(np.array(self.constraint_weight)))
    self.task2tag2idx = {}  # need one dictionary per task
    self.pred_layer = pred_layer
    self.in_dim = in_dim
    self.h_dim = h_dim
    self.c_in_dim = c_in_dim
    self.activation = activation
    self.lower = lower
    self.noise_sigma = noise_sigma
    self.h_layers = h_layers
    # keep track of the inner layers and the task predictors
    self.predictors = {
        'inner': [],
        'output_layers_dict': {},
        'task_expected_at': {}
    }
    self.wembeds = None  # lookup: embeddings for words
    self.cembeds = None  # lookup: embeddings for characters
    self.embeds_file = embeds_file
    self.char_rnn = None  # RNN for character input
def __init__(self,
             head_count: int,
             model_dim: int,
             downsample_factor: int = 1,
             input_dim: int = None,
             ignore_masks: bool = False,
             plot_attention: typing.Optional[str] = None,
             diag_gauss_mask: typing.Union[bool, numbers.Real] = False,
             square_mask_std: bool = True,
             cross_pos_encoding_type: typing.Optional[str] = None,
             kq_pos_encoding_type: typing.Optional[str] = None,
             kq_pos_encoding_size: int = 40,
             max_len: int = 1500,
             param_init: xnmt.param_initializers.ParamInitializer =
                 xnmt.param_initializers.GlorotInitializer(),
             bias_init: xnmt.param_initializers.ParamInitializer =
                 xnmt.param_initializers.ZeroInitializer(),
             linear_kvq=None,
             kq_positional_embedder=None,
             layer_norm=None,
             res_shortcut=None,
             desc: typing.Any = None) -> None:
    if input_dim is None:
        input_dim = model_dim
    self.input_dim = input_dim
    assert model_dim % head_count == 0
    self.dim_per_head = model_dim // head_count
    self.model_dim = model_dim
    self.head_count = head_count
    assert downsample_factor >= 1
    self.downsample_factor = downsample_factor
    self.plot_attention = plot_attention
    self.plot_attention_counter = 0
    self.desc = desc
    self.ignore_masks = ignore_masks
    self.diag_gauss_mask = diag_gauss_mask
    self.square_mask_std = square_mask_std
    self.kq_pos_encoding_type = kq_pos_encoding_type
    self.kq_pos_encoding_size = kq_pos_encoding_size
    self.max_len = max_len
    subcol = param_collections.ParamManager.my_params(self)
    if self.kq_pos_encoding_type is None:
        self.linear_kvq = self.add_serializable_component(
            "linear_kvq", linear_kvq,
            lambda: transforms.Linear(input_dim * downsample_factor,
                                      head_count * self.dim_per_head * 3,
                                      param_init=param_init,
                                      bias_init=bias_init))
    else:
        self.linear_kq, self.linear_v = self.add_serializable_component(
            "linear_kvq", linear_kvq, lambda: [
                transforms.Linear(
                    input_dim * downsample_factor + self.kq_pos_encoding_size,
                    head_count * self.dim_per_head * 2,
                    param_init=param_init,
                    bias_init=bias_init),
                transforms.Linear(input_dim * downsample_factor,
                                  head_count * self.dim_per_head,
                                  param_init=param_init,
                                  bias_init=bias_init)
            ])
        assert self.kq_pos_encoding_type == "embedding"
        self.kq_positional_embedder = self.add_serializable_component(
            "kq_positional_embedder", kq_positional_embedder,
            lambda: embedders.PositionEmbedder(
                max_pos=self.max_len,
                emb_dim=self.kq_pos_encoding_size,
                param_init=param_init))
    if self.diag_gauss_mask:
        if self.diag_gauss_mask == "rand":
            rand_init = np.exp(
                np.random.random(size=(self.head_count,)) * math.log(1000))
            self.diag_gauss_mask_sigma = subcol.add_parameters(
                dim=(1, 1, self.head_count),
                init=dy.NumpyInitializer(rand_init))
        else:
            self.diag_gauss_mask_sigma = subcol.add_parameters(
                dim=(1, 1, self.head_count),
                init=dy.ConstInitializer(self.diag_gauss_mask))
    self.layer_norm = self.add_serializable_component(
        "layer_norm", layer_norm, lambda: norms.LayerNorm(model_dim))
    if model_dim != input_dim * downsample_factor:
        self.res_shortcut = self.add_serializable_component(
            "res_shortcut", res_shortcut,
            lambda: transforms.Linear(input_dim * downsample_factor,
                                      model_dim,
                                      param_init=param_init,
                                      bias_init=bias_init))
    self.cross_pos_encoding_type = cross_pos_encoding_type
    if cross_pos_encoding_type == "embedding":
        self.cross_pos_emb_p1 = subcol.add_parameters(
            dim=(self.max_len, self.dim_per_head, self.head_count),
            init=dy.NormalInitializer(mean=1.0, var=0.001))
        self.cross_pos_emb_p2 = subcol.add_parameters(
            dim=(self.max_len, self.dim_per_head, self.head_count),
            init=dy.NormalInitializer(mean=1.0, var=0.001))
    elif cross_pos_encoding_type is not None:
        raise NotImplementedError()
def __init__(self,
             model,
             n_labels,
             src_ctx_dim=400,
             n_arc_mlp_units=400,
             n_label_mlp_units=100,
             arc_mlp_dropout=0.33,
             label_mlp_dropout=0.33):
    Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)
    self.src_ctx_dim = src_ctx_dim
    self.label_mlp_dropout = label_mlp_dropout
    self.arc_mlp_dropout = arc_mlp_dropout
    self.n_labels = n_labels
    self.W_arc_hidden_to_head = model.add_parameters(
        (n_arc_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_arc_mlp_units, src_ctx_dim))))
    self.b_arc_hidden_to_head = model.add_parameters(
        (n_arc_mlp_units,), init=dy.ConstInitializer(0))
    self.W_arc_hidden_to_dep = model.add_parameters(
        (n_arc_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_arc_mlp_units, src_ctx_dim))))
    self.b_arc_hidden_to_dep = model.add_parameters(
        (n_arc_mlp_units,), init=dy.ConstInitializer(0))
    self.W_label_hidden_to_head = model.add_parameters(
        (n_label_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_label_mlp_units, src_ctx_dim))))
    self.b_label_hidden_to_head = model.add_parameters(
        (n_label_mlp_units,), init=dy.ConstInitializer(0))
    self.W_label_hidden_to_dep = model.add_parameters(
        (n_label_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_label_mlp_units, src_ctx_dim))))
    self.b_label_hidden_to_dep = model.add_parameters(
        (n_label_mlp_units,), init=dy.ConstInitializer(0))
    self.U_arc_1 = model.add_parameters((n_arc_mlp_units, n_arc_mlp_units),
                                        init=dy.ConstInitializer(0))
    self.u_arc_2 = model.add_parameters((n_arc_mlp_units,),
                                        init=dy.ConstInitializer(0))
    self.U_label_1 = [
        model.add_parameters((n_label_mlp_units, n_label_mlp_units),
                             init=dy.ConstInitializer(0))
        for _ in range(n_labels)
    ]
    self.u_label_2_2 = [
        model.add_parameters((1, n_label_mlp_units),
                             init=dy.ConstInitializer(0))
        for _ in range(n_labels)
    ]
    self.u_label_2_1 = [
        model.add_parameters((n_label_mlp_units, 1),
                             init=dy.ConstInitializer(0))
        for _ in range(n_labels)
    ]
    self.b_label = [
        model.add_parameters((1,), init=dy.ConstInitializer(0))
        for _ in range(n_labels)
    ]
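# A hedged sketch of the deep biaffine arc score for one (head, dep) pair in
# the style of Dozat and Manning; h_head and h_dep are hypothetical MLP
# outputs of size n_arc_mlp_units, and the choice of h_head for the bias
# term is an assumption.
def arc_score(mlp, h_head, h_dep):
    U = dy.parameter(mlp.U_arc_1)
    u = dy.parameter(mlp.u_arc_2)
    return dy.transpose(h_head) * U * h_dep + dy.dot_product(u, h_head)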
def __init__(self,
             model,
             pos_labels,
             xpos_labels,
             src_ctx_dim=400,
             n_pos_tagger_mlp_units=200,
             n_xpos_tagger_mlp_units=200,
             mlps_dropout=0.33):
    self.src_ctx_dim = src_ctx_dim
    self.dropout = mlps_dropout
    self.pos_labels = pos_labels
    self.xpos_labels = xpos_labels
    Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)
    self.W_pos = model.add_parameters(
        (n_pos_tagger_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_pos_tagger_mlp_units, src_ctx_dim))))
    self.b_pos = model.add_parameters((n_pos_tagger_mlp_units,),
                                      init=dy.ConstInitializer(0))
    self.W_xpos = model.add_parameters(
        (n_xpos_tagger_mlp_units, src_ctx_dim),
        init=dy.NumpyInitializer(
            Saxe_initializer((n_xpos_tagger_mlp_units, src_ctx_dim))))
    self.b_xpos = model.add_parameters((n_xpos_tagger_mlp_units,),
                                       init=dy.ConstInitializer(0))
    self.W_affine_pos = model.add_parameters(
        (n_pos_tagger_mlp_units, pos_labels), init=dy.ConstInitializer(0))
    self.b_affine_pos = model.add_parameters((pos_labels,),
                                             init=dy.ConstInitializer(0))
    self.W_affine_xpos = model.add_parameters(
        (n_xpos_tagger_mlp_units, xpos_labels), init=dy.ConstInitializer(0))
    self.b_affine_xpos = model.add_parameters((xpos_labels,),
                                              init=dy.ConstInitializer(0))
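# A hedged sketch of the POS branch's forward pass: a one-hidden-layer MLP
# over the encoder context, then an affine output layer. `ctx` is a
# hypothetical src_ctx_dim-sized expression, and the rectify activation is
# an assumption (the Saxe gain above hints at a leaky ReLU instead).
def pos_logits(tagger, ctx):
    h = dy.rectify(dy.affine_transform([dy.parameter(tagger.b_pos),
                                        dy.parameter(tagger.W_pos), ctx]))
    # W_affine_pos is stored as (mlp_units, pos_labels), hence the transpose
    return dy.affine_transform([dy.parameter(tagger.b_affine_pos),
                                dy.transpose(dy.parameter(tagger.W_affine_pos)),
                                h])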