def setUp(self):
    """Load a pre-trained ja-en model from disk and the test data it decodes.

    The serialized experiment's "train" and "status" sections are overwritten
    with None so the model can be reused purely for decoding.
    """
    events.clear()
    ParamManager.init_param_col()
    # Plain string literal: the original f-prefix had no placeholders.
    load_experiment = LoadSerialized(
        filename="test/data/tiny_jaen.model",
        overwrite=[
            {"path": "train", "val": None},
            {"path": "status", "val": None},
        ])
    EXP_DIR = '.'
    EXP = "decode"
    uninitialized_experiment = YamlPreloader.preload_obj(load_experiment,
                                                         exp_dir=EXP_DIR,
                                                         exp_name=EXP)
    loaded_experiment = initialize_if_needed(uninitialized_experiment)
    ParamManager.populate()
    # Pull out the parts we need from the experiment.
    # (Unused src_vocab/trg_vocab locals removed.)
    self.model = loaded_experiment.model
    event_trigger.set_train(False)  # inference mode
    self.src_data = list(
        self.model.src_reader.read_sents("test/data/head.ja"))
    self.trg_data = list(
        self.model.trg_reader.read_sents("test/data/head.en"))
def setUp(self):
    """Build a 512-dim DefaultTranslator fixture over the head.ja/en toy data."""
    layer_dim = 512
    events.clear()
    ParamManager.init_param_col()
    src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    # Standard attentional encoder-decoder; vocab_size=100 caps the embedding
    # and softmax tables independently of the loaded vocab files.
    self.model = DefaultTranslator(
        src_reader=PlainTextReader(vocab=src_vocab),
        trg_reader=PlainTextReader(vocab=trg_vocab),
        src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
        encoder=BiLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=AutoRegressiveDecoder(
            input_dim=layer_dim,
            embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                     hidden_dim=layer_dim,
                                     decoder_input_dim=layer_dim,
                                     yaml_path="model.decoder.rnn"),
            # NonLinear input is 2*layer_dim: rnn output concatenated with context
            # - TODO confirm against AutoRegressiveDecoder
            transform=NonLinear(input_dim=layer_dim * 2, output_dim=layer_dim),
            scorer=Softmax(input_dim=layer_dim, vocab_size=100),
            bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
    )
    event_trigger.set_train(False)  # inference mode
    self.src_data = list(
        self.model.src_reader.read_sents("examples/data/head.ja"))
def setUp(self):
    """Reset global state and load the toy ja/en parallel data."""
    xnmt.events.clear()
    ParamManager.init_param_col()
    ja_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    en_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    self.src_reader = PlainTextReader(vocab=ja_vocab)
    self.trg_reader = PlainTextReader(vocab=en_vocab)
    self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
def setUp(self):
    """RNNG fixture: parse trees (CoNLL) as target-side RNNG actions."""
    # Seeding for reproducible parameter init and sampling
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 32
    xnmt.events.clear()
    ParamManager.init_param_col()
    edge_vocab = Vocab(vocab_file="examples/data/parse/head.en.edge_vocab")
    node_vocab = Vocab(vocab_file="examples/data/parse/head.en.node_vocab")
    value_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    self.src_reader = input_readers.PlainTextReader(vocab=value_vocab)
    self.trg_reader = input_readers.CoNLLToRNNGActionsReader(
        surface_vocab=value_vocab,
        nt_vocab=node_vocab,
        edg_vocab=edge_vocab)
    self.layer_dim = layer_dim
    self.src_data = list(
        self.src_reader.read_sents("examples/data/head.en"))
    self.trg_data = list(
        self.trg_reader.read_sents("examples/data/parse/head.en.conll"))
    self.loss_calculator = MLELoss()
    # Composes child representations into a head representation via fwd/bwd LSTMs
    self.head_composer = composer.DyerHeadComposer(
        fwd_combinator=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        bwd_combinator=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        transform=AuxNonLinear(input_dim=layer_dim,
                               aux_input_dim=layer_dim,
                               output_dim=layer_dim))
    self.model = DefaultTranslator(
        src_reader=self.src_reader,
        trg_reader=self.trg_reader,
        src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=len(value_vocab)),
        encoder=IdentitySeqTransducer(),
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=RNNGDecoder(
            input_dim=layer_dim,
            rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                     hidden_dim=layer_dim,
                                     decoder_input_dim=layer_dim),
            transform=AuxNonLinear(input_dim=layer_dim,
                                   output_dim=layer_dim,
                                   aux_input_dim=layer_dim),
            bridge=NoBridge(dec_dim=layer_dim, dec_layers=1),
            graph_reader=self.trg_reader,
            head_composer=self.head_composer))
    event_trigger.set_train(True)
    my_batcher = batchers.TrgBatcher(batch_size=1)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def setUp(self):
    """Segmenting-translator fixture with policy-gradient segmentation."""
    # Seeding for reproducible parameter init and sampling
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 4
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.segment_encoder_bilstm = BiLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim)
    self.segment_composer = SumComposer()
    self.src_reader = CharFromWordTextReader(vocab=Vocab(vocab_file="test/data/head.ja.charvocab"))
    self.trg_reader = PlainTextReader(vocab=Vocab(vocab_file="test/data/head.en.vocab"))
    self.loss_calculator = FeedbackLoss(child_loss=MLELoss(), repeat=5)
    baseline = Linear(input_dim=layer_dim, output_dim=1)
    policy_network = Linear(input_dim=layer_dim, output_dim=2)
    self.poisson_prior = PoissonPrior(mu=3.3)
    # eps_prob=0.0: the prior is never sampled, the policy output is always used
    self.eps_greedy = EpsilonGreedy(eps_prob=0.0, prior=self.poisson_prior)
    self.conf_penalty = ConfidencePenalty()
    self.policy_gradient = PolicyGradient(input_dim=layer_dim,
                                          output_dim=2,
                                          baseline=baseline,
                                          policy_network=policy_network,
                                          z_normalization=True,
                                          conf_penalty=self.conf_penalty)
    self.length_prior = PoissonLengthPrior(lmbd=3.3, weight=1)
    self.segmenting_encoder = SegmentingSeqTransducer(
        embed_encoder=self.segment_encoder_bilstm,
        segment_composer=self.segment_composer,
        final_transducer=BiLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        policy_learning=self.policy_gradient,
        eps_greedy=self.eps_greedy,
        length_prior=self.length_prior,
    )
    self.model = DefaultTranslator(
        src_reader=self.src_reader,
        trg_reader=self.trg_reader,
        src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
        encoder=self.segmenting_encoder,
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=AutoRegressiveDecoder(input_dim=layer_dim,
                                      rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                                               hidden_dim=layer_dim,
                                                               decoder_input_dim=layer_dim,
                                                               yaml_path="decoder"),
                                      transform=AuxNonLinear(input_dim=layer_dim,
                                                             output_dim=layer_dim,
                                                             aux_input_dim=layer_dim),
                                      scorer=Softmax(vocab_size=100, input_dim=layer_dim),
                                      embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                                      bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
    )
    event_trigger.set_train(True)
    self.layer_dim = layer_dim
    self.src_data = list(self.model.src_reader.read_sents("test/data/head.ja"))
    self.trg_data = list(self.model.trg_reader.read_sents("test/data/head.en"))
    my_batcher = batchers.TrgBatcher(batch_size=3)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def setUp(self):
    """Reset global state and load the toy ja/en data from test/data."""
    events.clear()
    ParamManager.init_param_col()
    self.src_reader = PlainTextReader(vocab=Vocab(vocab_file="test/data/head.ja.vocab"))
    self.trg_reader = PlainTextReader(vocab=Vocab(vocab_file="test/data/head.en.vocab"))
    self.src_data = list(self.src_reader.read_sents("test/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("test/data/head.en"))
def setUp(self):
    """Segmenting-translator fixture without policy learning (sum composer only)."""
    # Seeding for reproducible parameter init and sampling
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 4
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.segment_composer = SumComposer()
    self.src_reader = CharFromWordTextReader(vocab=Vocab(
        vocab_file="examples/data/head.ja.charvocab"))
    self.trg_reader = PlainTextReader(vocab=Vocab(
        vocab_file="examples/data/head.en.vocab"))
    self.loss_calculator = FeedbackLoss(child_loss=MLELoss(), repeat=5)
    self.segmenting_encoder = SegmentingSeqTransducer(
        segment_composer=self.segment_composer,
        final_transducer=BiLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
    )
    # vocab_size=100 caps embedding/softmax tables independently of the vocab files
    self.model = DefaultTranslator(
        src_reader=self.src_reader,
        trg_reader=self.trg_reader,
        src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
        encoder=self.segmenting_encoder,
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=AutoRegressiveDecoder(
            input_dim=layer_dim,
            rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                     hidden_dim=layer_dim,
                                     decoder_input_dim=layer_dim,
                                     yaml_path="decoder"),
            transform=AuxNonLinear(input_dim=layer_dim,
                                   output_dim=layer_dim,
                                   aux_input_dim=layer_dim),
            scorer=Softmax(vocab_size=100, input_dim=layer_dim),
            embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
    )
    event_trigger.set_train(True)
    self.layer_dim = layer_dim
    self.src_data = list(
        self.model.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(
        self.model.trg_reader.read_sents("examples/data/head.en"))
    my_batcher = batchers.TrgBatcher(batch_size=3)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def setUp(self):
    """SimultaneousTranslator fixture over the toy ja/en data."""
    # Seeding for reproducible parameter init and sampling
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 32
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.src_reader = PlainTextReader(vocab=Vocab(
        vocab_file="test/data/head.ja.vocab"))
    self.trg_reader = PlainTextReader(vocab=Vocab(
        vocab_file="test/data/head.en.vocab"))
    self.layer_dim = layer_dim
    self.src_data = list(self.src_reader.read_sents("test/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("test/data/head.en"))
    # Size embeddings/softmax from the actual vocabularies
    self.input_vocab_size = len(self.src_reader.vocab.i2w)
    self.output_vocab_size = len(self.trg_reader.vocab.i2w)
    self.loss_calculator = MLELoss()
    # Unidirectional encoder: simultaneous decoding cannot look ahead in the source
    self.model = SimultaneousTranslator(
        src_reader=self.src_reader,
        trg_reader=self.trg_reader,
        src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=self.input_vocab_size),
        encoder=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=AutoRegressiveDecoder(
            input_dim=layer_dim,
            rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                     hidden_dim=layer_dim,
                                     decoder_input_dim=layer_dim,
                                     yaml_path="decoder"),
            transform=AuxNonLinear(input_dim=layer_dim,
                                   output_dim=layer_dim,
                                   aux_input_dim=layer_dim),
            scorer=Softmax(vocab_size=self.output_vocab_size, input_dim=layer_dim),
            embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=self.output_vocab_size),
            bridge=NoBridge(dec_dim=layer_dim, dec_layers=1)),
    )
    event_trigger.set_train(True)
    my_batcher = batchers.TrgBatcher(batch_size=3)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def __init__(self,
             input_dim=Ref("exp_global.default_layer_dim"),
             param_init=Ref("exp_global.param_init", default=bare(GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init", default=bare(ZeroInitializer)),
             num_heads=8):
    """Multi-head projection parameters: weights/biases for Q, K, V and output.

    Args:
      input_dim: model dimension; must be divisible by num_heads
      param_init: initializer for the four (input_dim, input_dim) weight matrices
      bias_init: initializer for the four (1, input_dim) bias rows
      num_heads: number of attention heads

    Raises:
      ValueError: if input_dim is not divisible by num_heads
    """
    # Raise instead of assert: asserts are stripped under `python -O`.
    if input_dim % num_heads != 0:
        raise ValueError(
            f"input_dim ({input_dim}) must be divisible by num_heads ({num_heads})")
    param_collection = ParamManager.my_params(self)
    self.input_dim = input_dim
    self.num_heads = num_heads
    self.head_dim = input_dim // num_heads
    # One square projection each for queries, keys, values and output.
    self.pWq, self.pWk, self.pWv, self.pWo = [
        param_collection.add_parameters(dim=(input_dim, input_dim),
                                        init=param_init.initializer((input_dim, input_dim)))
        for _ in range(4)
    ]
    self.pbq, self.pbk, self.pbv, self.pbo = [
        param_collection.add_parameters(dim=(1, input_dim),
                                        init=bias_init.initializer((1, input_dim,)))
        for _ in range(4)
    ]
def __init__(self,
             e0: numbers.Real = 0.1,
             eps: numbers.Real = 1e-20,
             skip_noisy: bool = False) -> None:
    """Adagrad wrapper: builds the DyNet trainer over the global parameter collection."""
    trainer = dy.AdagradTrainer(ParamManager.global_collection(), e0, eps=eps)
    super().__init__(optimizer=trainer, skip_noisy=skip_noisy)
def __init__(self, filter_height, filter_width, channels, num_filters, stride):
    """Parameters for three stacked convolution layers.

    Args:
      filter_height: per-layer filter heights (indexable, entries 0-2)
      filter_width: per-layer filter widths (entries 0-2)
      channels: per-layer input channel counts (entries 0-2)
      num_filters: per-layer output filter counts (entries 0-2)
      stride: convolution stride, e.g. (2, 2)
    """
    model = ParamManager.my_params(self)
    self.filter_height = filter_height
    self.filter_width = filter_width
    self.channels = channels
    self.num_filters = num_filters
    self.stride = stride  # (2,2)
    self.hidden_states = {}
    normalInit = dy.NormalInitializer(0, 0.1)
    self.filters1 = model.add_parameters(dim=(self.filter_height[0],
                                              self.filter_width[0],
                                              self.channels[0],
                                              self.num_filters[0]),
                                         init=normalInit)
    self.filters2 = model.add_parameters(dim=(self.filter_height[1],
                                              self.filter_width[1],
                                              self.channels[1],
                                              self.num_filters[1]),
                                         init=normalInit)
    self.filters3 = model.add_parameters(dim=(self.filter_height[2],
                                              self.filter_width[2],
                                              self.channels[2],
                                              self.num_filters[2]),
                                         init=normalInit)
def __init__(self,
             eps: numbers.Real = 1e-6,
             rho: numbers.Real = 0.95,
             skip_noisy: bool = False) -> None:
    """Adadelta wrapper: builds the DyNet trainer over the global parameter collection."""
    trainer = dy.AdadeltaTrainer(ParamManager.global_collection(), eps, rho)
    super().__init__(optimizer=trainer, skip_noisy=skip_noisy)
def __init__(self,
             e0: numbers.Real = 0.01,
             mom: numbers.Real = 0.9,
             skip_noisy: bool = False) -> None:
    """Momentum-SGD wrapper: builds the DyNet trainer over the global parameter collection."""
    trainer = dy.MomentumSGDTrainer(ParamManager.global_collection(), e0, mom)
    super().__init__(optimizer=trainer, skip_noisy=skip_noisy)
def __init__(self, layers, input_dim, hidden_dim,
             param_init=Ref("exp_global.param_init", default=bare(GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init", default=bare(ZeroInitializer))):
    """Parameters for a single-layer custom LSTM.

    Args:
      layers: must be exactly 1
      input_dim: input dimension
      hidden_dim: hidden/output dimension
      param_init: initializer for the weight matrices
      bias_init: initializer for the bias vector

    Raises:
      RuntimeError: if layers != 1
    """
    if layers != 1:
        raise RuntimeError(
            "CustomLSTMSeqTransducer supports only exactly one layer")
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    model = ParamManager.my_params(self)
    # Gate order within the stacked 4*hidden_dim parameters: [i; f; o; g]
    self.p_Wx = model.add_parameters(dim=(hidden_dim * 4, input_dim),
                                     init=param_init.initializer(
                                         (hidden_dim * 4, input_dim)))
    self.p_Wh = model.add_parameters(dim=(hidden_dim * 4, hidden_dim),
                                     init=param_init.initializer(
                                         (hidden_dim * 4, hidden_dim)))
    self.p_b = model.add_parameters(dim=(hidden_dim * 4, ),
                                    init=bias_init.initializer(
                                        (hidden_dim * 4, )))
def setUp(self):
    """SimultaneousTranslator fixture with oracle read/write actions.

    The source side is a CompoundReader pairing the plain text with
    pre-computed simultaneous-translation actions.
    """
    # Seeding for reproducible parameter init and sampling
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 32
    xnmt.events.clear()
    ParamManager.init_param_col()
    src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    self.src_reader = CompoundReader(readers=[
        PlainTextReader(vocab=src_vocab),
        SimultActionTextReader()
    ], vocab=src_vocab)
    self.trg_reader = PlainTextReader(vocab=Vocab(vocab_file="examples/data/head.en.vocab"))
    self.layer_dim = layer_dim
    # CompoundReader takes one file per sub-reader
    self.src_data = list(self.src_reader.read_sents(["examples/data/head.ja",
                                                     "examples/data/simult/head.jaen.actions"]))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
    self.input_vocab_size = len(self.src_reader.vocab.i2w)
    self.output_vocab_size = len(self.trg_reader.vocab.i2w)
    self.loss_calculator = loss_calculators.MLELoss()
    self.model = SimultaneousTranslator(
        src_reader=self.src_reader,
        trg_reader=self.trg_reader,
        src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=self.input_vocab_size),
        encoder=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
        attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
        decoder=AutoRegressiveDecoder(input_dim=layer_dim,
                                      rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                                               hidden_dim=layer_dim,
                                                               decoder_input_dim=layer_dim,
                                                               yaml_path="decoder"),
                                      transform=AuxNonLinear(input_dim=layer_dim,
                                                             output_dim=layer_dim,
                                                             aux_input_dim=layer_dim),
                                      scorer=Softmax(vocab_size=self.output_vocab_size,
                                                     input_dim=layer_dim),
                                      embedder=LookupEmbedder(emb_dim=layer_dim,
                                                              vocab_size=self.output_vocab_size),
                                      bridge=NoBridge(dec_dim=layer_dim, dec_layers=1)),
        # MLP input is 2*layer_dim - presumably decoder and encoder states
        # concatenated; TODO confirm against SimultaneousTranslator
        policy_network=network.PolicyNetwork(transforms.MLP(2*self.layer_dim, self.layer_dim, 2)),
        policy_train_oracle=True,
        policy_test_oracle=True
    )
    event_trigger.set_train(True)
    my_batcher = batchers.TrgBatcher(batch_size=3)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def __init__(self,
             alpha: numbers.Real = 0.001,
             beta_1: numbers.Real = 0.9,
             beta_2: numbers.Real = 0.999,
             eps: numbers.Real = 1e-8,
             skip_noisy: bool = False) -> None:
    """Adam wrapper: builds the DyNet trainer over the global parameter collection."""
    trainer = dy.AdamTrainer(ParamManager.global_collection(), alpha, beta_1, beta_2, eps)
    super().__init__(optimizer=trainer, skip_noisy=skip_noisy)
def __init__(self,
             layers=1,
             input_dim=Ref("exp_global.default_layer_dim"),
             hidden_dim=Ref("exp_global.default_layer_dim"),
             dropout=Ref("exp_global.dropout", default=0.0),
             weightnoise_std=Ref("exp_global.weight_noise", default=0.0),
             param_init=Ref("exp_global.param_init", default=bare(GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init", default=bare(ZeroInitializer)),
             yaml_path=None,
             decoder_input_dim=Ref("exp_global.default_layer_dim", default=None),
             decoder_input_feeding=True):
    """Parameters for a stack of unidirectional LSTM layers.

    Args:
      layers: number of stacked LSTM layers
      input_dim: input dimension of the first layer
      hidden_dim: hidden/output dimension of every layer
      dropout: dropout rate (stored; masks created lazily)
      weightnoise_std: std-dev of weight noise (stored)
      param_init: initializer, or list of initializers (one per layer), for weights
      bias_init: initializer, or list of initializers (one per layer), for biases
      yaml_path: config path of this component; used to detect decoder usage
      decoder_input_dim: extra input size when used inside a decoder with input feeding
      decoder_input_feeding: whether fed-back decoder input widens the first layer
    """
    self.num_layers = layers
    model = ParamManager.my_params(self)
    # When used as a decoder RNN with input feeding, the fed-back input is
    # concatenated to the regular input, so the first layer's input grows.
    if yaml_path is not None and "decoder" in yaml_path:
        if decoder_input_feeding:
            input_dim += decoder_input_dim
    self.hidden_dim = hidden_dim
    self.dropout_rate = dropout
    self.weightnoise_std = weightnoise_std
    self.input_dim = input_dim
    # A single initializer is broadcast to all layers
    if not isinstance(param_init, Sequence):
        param_init = [param_init] * layers
    if not isinstance(bias_init, Sequence):
        bias_init = [bias_init] * layers
    # Gate order within the stacked 4*hidden_dim parameters: [i; f; o; g]
    self.p_Wx = [
        model.add_parameters(dim=(hidden_dim * 4, input_dim),
                             init=param_init[0].initializer(
                                 (hidden_dim * 4, input_dim), num_shared=4))
    ]
    # Layers above the first take hidden_dim-sized input
    self.p_Wx += [
        model.add_parameters(dim=(hidden_dim * 4, hidden_dim),
                             init=param_init[i].initializer(
                                 (hidden_dim * 4, hidden_dim), num_shared=4))
        for i in range(1, layers)
    ]
    self.p_Wh = [
        model.add_parameters(dim=(hidden_dim * 4, hidden_dim),
                             init=param_init[i].initializer(
                                 (hidden_dim * 4, hidden_dim), num_shared=4))
        for i in range(layers)
    ]
    self.p_b = [
        model.add_parameters(dim=(hidden_dim * 4, ),
                             init=bias_init[i].initializer(
                                 (hidden_dim * 4, ), num_shared=4))
        for i in range(layers)
    ]
    # Dropout masks are created lazily elsewhere
    self.dropout_mask_x = None
    self.dropout_mask_h = None
def __init__(self,
             alpha: numbers.Real = 0.001,
             beta_1: numbers.Real = 0.9,
             beta_2: numbers.Real = 0.999,
             eps: numbers.Real = 1e-8,
             update_every: int = 1,
             skip_noisy: bool = False):
    """Adam trainer variant that accepts an update_every argument.

    NOTE(review): update_every is accepted here but neither stored nor
    forwarded to the superclass - confirm whether it is consumed via some
    other mechanism, otherwise it is silently ignored.
    """
    super().__init__(optimizer=dy.AdamTrainer(
        ParamManager.global_collection(), alpha, beta_1, beta_2, eps),
        skip_noisy=skip_noisy)
def __init__(self, layers=1, input_dim=512, h=1, dropout=0.0,
             attn_dropout=False, layer_norm=False, **kwargs):
    """Stack of encoder layers kept as (name, layer) pairs, names 'l1'..'lN'."""
    params = ParamManager.my_params(self)
    self.layer_names = [
        ('l{}'.format(idx), EncoderLayer(params, input_dim, h, attn_dropout, layer_norm))
        for idx in range(1, layers + 1)
    ]
    self.dropout_val = dropout
def __init__(self, filter_height, filter_width, channels, num_filters, stride,
             rhn_num_hidden_layers, rhn_dim, rhn_microsteps, attention_dim,
             residual=False):
    """Convolutional front-end + recurrent-highway layers + attention parameters.

    Args:
      filter_height: conv filter height
      filter_width: conv filter width
      channels: conv input channels
      num_filters: conv output filters
      stride: conv stride (stored only)
      rhn_num_hidden_layers: number of stacked RHN layers
      rhn_dim: RHN hidden dimension
      rhn_microsteps: recurrence depth within each RHN step
      attention_dim: attention MLP dimension
      residual: whether residual connections are enabled (stored only)
    """
    self.filter_height = filter_height
    self.filter_width = filter_width
    self.channels = channels
    self.num_filters = num_filters
    self.stride = stride
    self.rhn_num_hidden_layers = rhn_num_hidden_layers
    self.rhn_dim = rhn_dim
    self.rhn_microsteps = rhn_microsteps
    self.attention_dim = attention_dim
    self.residual = residual
    model = ParamManager.my_params(self)
    # Convolutional layer
    self.filter_conv = model.add_parameters(dim=(self.filter_height,
                                                 self.filter_width,
                                                 self.channels,
                                                 self.num_filters))
    # Recurrent highway layer
    self.recur = []
    self.linear = []
    self.init = []
    self.attention = []
    input_dim = num_filters
    for _ in range(rhn_num_hidden_layers):
        # Per-layer initial state and two input projections
        self.init.append(model.add_parameters((rhn_dim, )))
        self.linear.append((model.add_parameters((rhn_dim, input_dim)),
                            model.add_parameters((rhn_dim, input_dim, ))))
        input_dim = rhn_dim
        # Each microstep owns four parameter tensors (two matrices, two biases)
        recur_layer = []
        for _ in range(self.rhn_microsteps):
            recur_layer.append((model.add_parameters((rhn_dim, rhn_dim)),
                                model.add_parameters((rhn_dim, )),
                                model.add_parameters((rhn_dim, rhn_dim, )),
                                model.add_parameters((rhn_dim, ))))
        self.recur.append(recur_layer)
    # Attention layer
    self.attention.append((model.add_parameters((attention_dim, rhn_dim)),
                           model.add_parameters(attention_dim, )))
def __init__(self, child: SeqTransducer, input_dim: int, layer_norm: bool = False):
    """Wrap a child transducer, optionally adding layer-norm gain/bias parameters."""
    self.child = child
    self.input_dim = input_dim
    self.layer_norm = layer_norm
    if layer_norm:
        params = ParamManager.my_params(self)
        # Layer-norm gain (g) and bias (b), one value per feature
        self.ln_g = params.add_parameters(dim=(input_dim, ))
        self.ln_b = params.add_parameters(dim=(input_dim, ))
def setUp(self):
    """Fixture wiring word-, char- and ngram-level vocabularies for the ja/en data."""
    # Seeding for reproducible parameter init and sampling
    np.random.seed(2)
    random.seed(2)
    layer_dim = 4
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    # Character vocabulary derived from the same word-vocab file
    self.src_char_vocab = CharVocab(vocab_file="examples/data/head.ja.vocab")
    self.ngram_vocab = Vocab(vocab_file="examples/data/head.ngramcount.ja")
    self.trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    self.src_reader = CharFromWordTextReader(vocab=self.src_vocab,
                                             char_vocab=self.src_char_vocab)
    self.trg_reader = PlainTextReader(vocab=self.trg_vocab)
    self.layer_dim = layer_dim
    self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
    self.src, self.trg = batchers.TrgBatcher(batch_size=3).pack(self.src_data, self.trg_data)
    # immediate_compute/check_validity surface shape errors at construction time
    dy.renew_cg(immediate_compute=True, check_validity=True)
def __init__(self, ngram_size,
             param_init=Ref("exp_global.param_init", default=bare(GlorotInitializer)),
             bias_init=Ref("exp_global.bias_init", default=bare(ZeroInitializer)),
             embed_dim=Ref("exp_global.default_layer_dim"),
             hidden_dim=Ref("exp_global.default_layer_dim")):
    """N-gram convolution parameters: one (1, ngram_size, embed_dim, hidden_dim) filter plus a bias.

    Args:
      ngram_size: width of the n-gram window
      param_init: initializer for the filter
      bias_init: initializer for the bias
      embed_dim: embedding dimension
      hidden_dim: number of output channels of the filter
    """
    model = ParamManager.my_params(self)
    dim = (1, ngram_size, embed_dim, hidden_dim)
    self.filter = model.add_parameters(dim=dim, init=param_init.initializer(dim))
    # Fix 1: the bias default now refs exp_global.bias_init (was exp_global.param_init),
    # matching the convention used by the sibling components.
    # Fix 2: the bias initializer is given the bias's own shape, not the 4-D filter shape.
    # NOTE(review): bias length is embed_dim while the filter's output channel count
    # is hidden_dim - confirm this is intended where the bias is applied.
    self.bias = model.add_parameters(dim=(embed_dim,),
                                     init=bias_init.initializer((embed_dim,)))
    self.ngram_size = ngram_size
    self.embed_dim = embed_dim
def __init__(self, input_dim, window_receptor, output_dim, num_layers,
             internal_dim, non_linearity='linear'):
    """Receptor conv layer followed by num_layers 1x1 conv layers and an output conv.

    Args:
      num_layers: num layers after first receptor conv
      input_dim: size of the inputs
      window_receptor: window for the receptor
      output_dim: size of the outputs
      internal_dim: size of hidden dimension, internal dimension
      non_linearity: non-linearity applied between layers; one of
        'linear', 'tanh', 'relu', 'sigmoid'

    Raises:
      ValueError: if non_linearity is not one of the supported values
    """
    model = ParamManager.my_params(self)
    self.input_dim = input_dim
    self.window_receptor = window_receptor
    self.internal_dim = internal_dim
    self.non_linearity = non_linearity
    self.output_dim = output_dim
    # Gain associated with the chosen non-linearity
    if self.non_linearity == 'linear':
        self.gain = 1.0
    elif self.non_linearity == 'tanh':
        self.gain = 1.0
    elif self.non_linearity == 'relu':
        self.gain = 0.5
    elif self.non_linearity == 'sigmoid':
        self.gain = 4.0
    else:
        # Fix: previously an unknown value silently left self.gain unset,
        # causing an AttributeError later instead of a clear error here.
        raise ValueError("unsupported non_linearity: {!r}".format(non_linearity))
    normalInit = dy.NormalInitializer(0, 0.1)
    # First (receptor) convolution over the input window
    self.pConv1 = model.add_parameters(dim=(self.input_dim,
                                            self.window_receptor,
                                            1,
                                            self.internal_dim),
                                       init=normalInit)
    self.pBias1 = model.add_parameters(dim=(self.internal_dim, ))
    # Stack of 1x1 internal convolutions
    self.builder_layers = []
    for _ in range(num_layers):
        conv = model.add_parameters(dim=(self.internal_dim, 1, 1, self.internal_dim),
                                    init=normalInit)
        bias = model.add_parameters(dim=(self.internal_dim, ))
        self.builder_layers.append((conv, bias))
    # Final projection to the output dimension
    self.last_conv = model.add_parameters(dim=(self.internal_dim, 1, 1, self.output_dim),
                                          init=normalInit)
    self.last_bias = model.add_parameters(dim=(self.output_dim, ))
def __init__(self, layers=1, input_dim=512, h=1, dropout=0.0,
             attn_dropout=False, layer_norm=False,
             vocab_size=None, vocab=None,
             trg_reader=Ref("model.trg_reader")):
    """Stack of decoder layers plus the output projection to the vocabulary.

    Args:
      layers: number of decoder layers (named 'l1'..'lN')
      input_dim: model dimension
      h: number of attention heads
      dropout: dropout rate (stored)
      attn_dropout: whether to apply dropout inside attention
      layer_norm: whether decoder layers use layer normalization
      vocab_size / vocab / trg_reader: alternative sources for the output
        vocabulary size, resolved by choose_vocab_size
    """
    dy_model = ParamManager.my_params(self)
    self.layer_names = []
    for i in range(1, layers + 1):
        name = 'l{}'.format(i)
        layer = DecoderLayer(dy_model, input_dim, h, attn_dropout, layer_norm)
        self.layer_names.append((name, layer))
    # Vocabulary size may come from an explicit size, a vocab object, or the trg reader
    self.vocab_size = self.choose_vocab_size(vocab_size, vocab, trg_reader)
    self.output_affine = LinearSent(dy_model, input_dim, self.vocab_size)
    self.dropout_val = dropout
def __init__(self, in_height, out_height):
    """Parameters for a single affine (fully connected) projection.

    Args:
      in_height: size of the inputs
      out_height: size of the outputs
    """
    model = ParamManager.my_params(self)
    self.in_height = in_height
    self.out_height = out_height
    normalInit = dy.NormalInitializer(0, 0.1)
    # Weight matrix (out x in) and bias vector (out)
    self.pW = model.add_parameters(dim=(self.out_height, self.in_height), init=normalInit)
    self.pb = model.add_parameters(dim=self.out_height)
def update(self) -> None:
    """Perform one optimizer step: clip gradients, advance the LR scheduler,
    log to tensorboard, then apply the update (unless gradients look noisy).

    The parameter update is skipped only when skip_noisy is enabled AND
    check_gradients_noisy() reports noise.
    """
    self.global_step += 1
    if self.rescale_grads:
        # Global-norm gradient clipping over all trainable parameters
        torch.nn.utils.clip_grad_norm_(ParamManager.global_collection().parameters(),
                                       self.rescale_grads)
    self.scheduler.step()
    if settings.USE_TENSORBOARD:
        tee.tensorboard_writer.add_scalars(name="lr",
                                           tag_scalar_dict={"lr": self.learning_rate * self.lr_factor},
                                           global_step=self.global_step)
        if not self.skip_noisy:
            # grad_log_norm() returns a log-scale norm; exponentiate for display.
            # NOTE(review): grad norm is logged only when skip_noisy is OFF -
            # confirm this inversion is intentional.
            tee.tensorboard_writer.add_scalars(name="grad",
                                               tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                               global_step=self.global_step)
    if not (self.skip_noisy and self.check_gradients_noisy()):
        self.optimizer.step()
    else:
        logger.info("skipping noisy update")
def __init__(self, word_vocab=None, src_vocab=Ref(Path("model.src_reader.vocab")),
             hidden_dim=Ref("exp_global.default_layer_dim"), vocab_size=25000):
    """Word-lookup table; falls back to a fresh Vocab sized by vocab_size."""
    super().__init__()
    params = ParamManager.my_params(self)
    if word_vocab is not None:
        num_entries = len(word_vocab)
    else:
        # No vocab supplied: start from an empty one with a fixed table size
        word_vocab = Vocab()
        num_entries = vocab_size
    self.src_vocab = src_vocab
    self.word_vocab = word_vocab
    self.embedding = params.add_lookup_parameters((num_entries, hidden_dim))
def __init__(self,
             e0: numbers.Real = 0.1,
             momentum: numbers.Real = 0.0,
             weight_decay: numbers.Real = 0.0,
             dampening: numbers.Real = 0.0,
             nesterov: bool = False,
             skip_noisy: bool = False,
             rescale_grads: numbers.Real = 5.0) -> None:
    """SGD wrapper: builds the torch optimizer over the global parameter collection."""
    sgd = torch.optim.SGD(params=ParamManager.global_collection().parameters(),
                          lr=e0,
                          momentum=momentum,
                          weight_decay=weight_decay,
                          dampening=dampening,
                          nesterov=nesterov)
    super().__init__(optimizer=sgd, skip_noisy=skip_noisy, rescale_grads=rescale_grads)
def __init__(self,
             alpha: numbers.Real = 1.0,
             dim: numbers.Integral = 512,
             warmup_steps: Optional[numbers.Integral] = 4000,
             beta_1: numbers.Real = 0.9,
             beta_2: numbers.Real = 0.98,
             eps: numbers.Real = 1e-9,
             skip_noisy: bool = False,
             rescale_grads: numbers.Real = 5.0) -> None:
    """Adam wrapper that additionally records dim/warmup_steps/steps for LR scheduling."""
    adam = torch.optim.Adam(params=ParamManager.global_collection().parameters(),
                            lr=alpha,
                            betas=(beta_1, beta_2),
                            eps=eps)
    super().__init__(optimizer=adam, skip_noisy=skip_noisy, rescale_grads=rescale_grads)
    self.dim = dim
    self.warmup_steps = warmup_steps
    self.steps = 0