def get_ae_pretrainer(layer, data, batch_size, epochs=30):
    init_lr = 0.05
    train_algo = SGD(
        batch_size=batch_size,
        learning_rate=init_lr,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=batch_size,
        monitoring_dataset=data,
        # for ContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')]]),
        # for HigherOrderContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')],
        #                             [0.5, cost.MethodCost(method='higher_order_penalty')]]),
        # for DenoisingAutoencoder:
        cost=MeanSquaredReconstructionError(),
        termination_criterion=EpochCounter(epochs))
    return Train(model=layer, algorithm=train_algo, dataset=data,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.9, start=0, saturate=25),
                     LinearDecayOverEpoch(start=1, saturate=25, decay_factor=.02)
                 ])
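A minimal usage sketch for the pretrainer above, not taken from the original code: the random data, layer sizes, and corruption level are made up for illustration, and it assumes pylearn2's DenoisingAutoencoder/BinomialCorruptor plus the imports already used inside get_ae_pretrainer (SGD, Train, etc.) are in scope.

# Hedged usage sketch (assumptions: random data, made-up sizes, pylearn2 importable).
import numpy as np
from pylearn2.corruption import BinomialCorruptor
from pylearn2.models.autoencoder import DenoisingAutoencoder
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

# A small random design matrix standing in for real training data.
data = DenseDesignMatrix(X=np.random.randn(1000, 196).astype('float32'))
# One denoising autoencoder layer to pretrain.
layer = DenoisingAutoencoder(corruptor=BinomialCorruptor(corruption_level=0.3),
                             nvis=196, nhid=64,
                             act_enc='tanh', act_dec=None)
pretrainer = get_ae_pretrainer(layer, data, batch_size=10)
pretrainer.main_loop()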
def create_adjustors(self):
    initial_momentum = .5
    final_momentum = .99
    start = 1
    saturate = self.max_epochs
    self.momentum_adjustor = learning_rule.MomentumAdjustor(
        final_momentum, start, saturate)
    self.momentum_rule = learning_rule.Momentum(initial_momentum,
                                                nesterov_momentum=True)
    if self.lr_monitor_decay:
        self.learning_rate_adjustor = MonitorBasedLRAdjuster(
            high_trigger=1., shrink_amt=0.9,
            low_trigger=.95, grow_amt=1.1,
            channel_name='train_objective')
    elif self.lr_lin_decay:
        self.learning_rate_adjustor = LinearDecayOverEpoch(
            start, saturate, self.lr_lin_decay)
def get_finetuner(model, trainset, batch_size=100, epochs=100):
    train_algo = SGD(batch_size=batch_size,
                     learning_rule=Momentum(init_momentum=0.5),
                     learning_rate=0.5,
                     monitoring_batches=batch_size,
                     monitoring_dataset=trainset,
                     cost=Dropout(input_include_probs={'h0': .5},
                                  input_scales={'h0': 2.}),
                     termination_criterion=EpochCounter(epochs))
    path = DATA_DIR + 'model' + str(SUBMODEL) + 'saved_daex.pkl'
    return Train(model=model, algorithm=train_algo, dataset=trainset,
                 save_path=path, save_freq=10,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.9, start=0,
                                      saturate=int(epochs * 0.8)),
                     LinearDecayOverEpoch(start=1, saturate=int(epochs * 0.7),
                                          decay_factor=.02)
                 ])
def get_trainer2(model, trainset, epochs=50):
    # NOTE: `bsize` and `DATA_DIR` are not defined in this snippet; they are
    # presumably module-level values elsewhere in the original project.
    train_algo = SGD(
        batch_size=bsize,
        learning_rate=0.5,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=bsize,
        monitoring_dataset=trainset,
        cost=Dropout(input_include_probs={'h0': .8},
                     input_scales={'h0': 1.}),
        termination_criterion=EpochCounter(epochs),
    )
    path = DATA_DIR + 'model2saved_conv.pkl'
    return Train(model=model, algorithm=train_algo, dataset=trainset,
                 save_path=path, save_freq=1,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.7, start=0,
                                      saturate=int(epochs * 0.5)),
                     LinearDecayOverEpoch(start=1, saturate=int(epochs * 0.8),
                                          decay_factor=.01)
                 ])
def get_trainer(model, trainset, validset, save_path):
    monitoring = dict(valid=validset, train=trainset)
    termination = MonitorBased(channel_name='valid_y_misclass',
                               prop_decrease=.001, N=100)
    extensions = [MonitorBasedSaveBest(channel_name='valid_y_misclass',
                                       save_path=save_path),
                  #MomentumAdjustor(start=1, saturate=100, final_momentum=.9),
                  LinearDecayOverEpoch(start=1, saturate=200, decay_factor=0.01)]
    config = {
        'learning_rate': .01,
        #'learning_rule': Momentum(0.5),
        'learning_rule': RMSProp(),
        'train_iteration_mode': 'shuffled_sequential',
        'batch_size': 1200,  # 250,
        #'batches_per_iter': 100,
        'monitoring_dataset': monitoring,
        'monitor_iteration_mode': 'shuffled_sequential',
        'termination_criterion': termination,
    }
    return Train(model=model,
                 algorithm=SGD(**config),
                 dataset=trainset,
                 extensions=extensions)
def test_linear_decay_over_epoch():
    # Tests that the LinearDecayOverEpoch class in sgd.py adjusts the learning
    # rate properly over the training epochs.  It runs a small softmax model
    # and checks the monitored learning-rate values at the end.  The learning
    # rate is expected to start changing at epoch 'start' by the amount 'step'
    # computed below; the decrease continues linearly until epoch 'saturate',
    # at which point the learning rate equals 'learning_rate * decay_factor'.
    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)
    dataset = DenseDesignMatrix(X=X)

    m = 15
    X = rng.randn(m, dim)

    # Including a monitoring dataset lets us test that the monitor works
    # with supervised data.
    monitoring_dataset = DenseDesignMatrix(X=X)

    model = SoftmaxModel(dim)

    learning_rate = 1e-1
    batch_size = 5

    # We need to include this so the test actually stops running at some point.
    epoch_num = 15
    termination_criterion = EpochCounter(epoch_num)

    cost = DummyCost()

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=3,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    start = 5
    saturate = 10
    decay_factor = 0.1
    linear_decay = LinearDecayOverEpoch(start=start,
                                        saturate=saturate,
                                        decay_factor=decay_factor)

    train = Train(dataset, model, algorithm,
                  save_path=None, save_freq=0,
                  extensions=[linear_decay])

    train.main_loop()

    lr = model.monitor.channels['learning_rate']
    step = (learning_rate - learning_rate * decay_factor) / (saturate - start + 1)

    for i in xrange(epoch_num + 1):
        actual = lr.val_record[i]
        if i < start:
            expected = learning_rate
        elif i >= saturate:
            expected = learning_rate * decay_factor
        else:  # start <= i < saturate
            expected = decay_factor * learning_rate + (saturate - i) * step
        if not np.allclose(actual, expected):
            raise AssertionError(
                "After %d epochs, expected learning rate to be %f, "
                "but it is %f." % (i, expected, actual))
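The schedule the test above verifies can be stated on its own.  This is only a restatement of the formulas already used in the test; the helper name expected_lr is hypothetical, and its defaults mirror the test's constants.

# Hedged sketch (not part of the pylearn2 test): the learning-rate schedule
# enforced by LinearDecayOverEpoch, written as a plain function.
def expected_lr(epoch, learning_rate=1e-1, start=5, saturate=10, decay_factor=0.1):
    """Learning rate after `epoch` epochs under a linear decay over epochs."""
    step = (learning_rate - learning_rate * decay_factor) / (saturate - start + 1)
    if epoch < start:
        return learning_rate
    if epoch >= saturate:
        return learning_rate * decay_factor
    return decay_factor * learning_rate + (saturate - epoch) * step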
def main():
    # creating layers
    # 2 convolutional rectified layers, border mode valid
    batch_size = 64
    lr = 0.1 / 20
    finMomentum = 0.9
    maxout_units = 2000
    num_pcs = 4
    lay1_reg = lay2_reg = maxout_reg = None
    #save_path = './models/no_maxout/titan_lr_0.1_btch_64_momFinal_0.9_maxout_2000_4.joblib'
    #best_path = '/models/no_maxout/titan_bart10_gpu2_best.joblib'
    #save_path = './models/'+params.host+'_'+params.device+'_'+sys.argv[1]+'.joblib'
    #best_path = './models/'+params.host+'_'+params.device+'_'+sys.argv[1]+'best.joblib'
    save_path = '/Tmp/zumerjer/eos1_sumcost_nodrop_noada_small_ema.joblib'
    best_path = '/Tmp/zumerjer/eos1_sumcost_nodrop_noada_small_ema_best.joblib'
    #numBatches = 400000/batch_size

    '''
    print 'Applying preprocessing'

    ddmTrain = EmotiwKeypoints(start=0, stop=40000)
    ddmValid = EmotiwKeypoints(start=40000, stop=44000)
    ddmTest = EmotiwKeypoints(start=44000)

    stndrdz = preprocessing.Standardize()
    stndrdz.applyLazily(ddmTrain, can_fit=True, name='train')
    stndrdz.applyLazily(ddmValid, can_fit=False, name='val')
    stndrdz.applyLazily(ddmTest, can_fit=False, name='test')

    GCN = preprocessing.GlobalContrastNormalization(batch_size=1000)
    GCN.apply(ddmTrain, can_fit=True, name='train')
    GCN.apply(ddmValid, can_fit=False, name='val')
    GCN.apply(ddmTest, can_fit=False, name='test')
    return
    '''

    ddmTrain = ComboDatasetPyTable('/Tmp/zumerjer/all_', which_set='train')
    ddmValid = ComboDatasetPyTable('/Tmp/zumerjer/all_', which_set='valid')
    ddmSmallTrain = ComboDatasetPyTable('/Tmp/zumerjer/all_', which_set='small_train')

    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 W_lr_scale=0.1,
                                 max_kernel_norm=lay1_reg)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=128,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 W_lr_scale=0.1,
                                 max_kernel_norm=lay2_reg)

    # Rectified linear units
    #layer3 = RectifiedLinear(dim=3000,
    #                         sparse_init=15,
    #                         layer_name='RectLin3')

    # Maxout layer
    maxout = Maxout(layer_name='maxout',
                    irange=.005,
                    num_units=maxout_units,
                    num_pieces=num_pcs,
                    W_lr_scale=0.1,
                    max_col_norm=maxout_reg)

    # multisoftmax
    n_groups = 196
    n_classes = 96
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups, irange=0.05,
                           n_classes=n_classes, layer_name=layer_name)

    # setting up MLP
    MLPerc = MLP(batch_size=batch_size,
                 input_space=Conv2DSpace(shape=[96, 96],
                                         num_channels=3,
                                         axes=('b', 0, 1, 'c')),
                 layers=[layer1, layer2, maxout, layerMS])

    # mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)
    #mlp_cost.setup_dropout(input_include_probs={'convRect1': 0.8},
    #                       input_scales={'convRect1': 1.})

    #dropout_cost = Dropout(input_include_probs={'convRect1': .8},
    #                       input_scales={'convRect1': 1.})

    # algorithm
    monitoring_dataset = {'validation': ddmValid, 'mini-train': ddmSmallTrain}

    term_crit = MonitorBased(prop_decrease=1e-7, N=100,
                             channel_name='validation_objective')

    kpSGD = KeypointSGD(learning_rate=lr,
                        init_momentum=0.5,
                        monitoring_dataset=monitoring_dataset,
                        batch_size=batch_size,
                        termination_criterion=term_crit,
                        cost=mlp_cost)

    # train extension
    #train_ext = ExponentialDecayOverEpoch(decay_factor=0.998, min_lr_scale=0.001)
    train_ext = LinearDecayOverEpoch(start=1, saturate=250, decay_factor=.01)
    #train_ext = ADADELTA(0.95)

    # train object
    train = Train(dataset=ddmTrain,
                  save_path=save_path,
                  save_freq=10,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[train_ext,
                              MonitorBasedSaveBest(channel_name='validation_objective',
                                                   save_path=best_path),
                              MomentumAdjustor(start=1, saturate=25,
                                               final_momentum=finMomentum)])
    train.main_loop()
    train.save()
class SequenceTaggerNetwork(Model):
    def __init__(self, dataset, w2i, t2i, featurizer,
                 edim=None, hdims=None, fedim=None, max_epochs=100,
                 use_momentum=False, lr=.01, lr_lin_decay=None,
                 lr_scale=False, lr_monitor_decay=False,
                 valid_stop=False, reg_factors=None,
                 dropout=False, dropout_params=None,
                 embedding_init=None, embedded_model=None,
                 monitor_train=True, plot_monitor=None, num=False):
        super(SequenceTaggerNetwork, self).__init__()
        self.vocab_size = dataset.vocab_size
        self.window_size = dataset.window_size
        self.total_feats = dataset.total_feats
        self.feat_num = dataset.feat_num
        self.n_classes = dataset.n_classes
        self.max_epochs = max_epochs
        if edim is None:
            edim = 50
        if hdims is None:
            hdims = [100]
        if fedim is None:
            fedim = 5
        self.edim = edim
        self.fedim = fedim
        self.hdims = hdims
        self.w2i = w2i
        self.t2i = t2i
        self.featurizer = featurizer
        self._create_tagger()
        A_value = numpy.random.uniform(low=-.1, high=.1,
                                       size=(self.n_classes + 2, self.n_classes))
        self.A = sharedX(A_value, name='A')
        self.use_momentum = use_momentum
        self.lr = lr
        self.lr_lin_decay = lr_lin_decay
        self.lr_monitor_decay = lr_monitor_decay
        self.lr_scale = lr_scale
        self.valid_stop = valid_stop
        self.reg_factors = reg_factors
        self.close_cache = {}
        self.dropout_params = dropout_params
        self.dropout = dropout or self.dropout_params is not None
        self.hdims = hdims
        self.monitor_train = monitor_train
        self.num = num
        self.plot_monitor = plot_monitor
        if embedding_init is not None:
            self.set_embedding_weights(embedding_init)

    def _create_tagger(self):
        self.tagger = WordTaggerNetwork(self.vocab_size, self.window_size,
                                        self.total_feats, self.feat_num,
                                        self.hdims, self.edim, self.fedim,
                                        self.n_classes)

    def _create_data_specs(self, dataset):
        self.input_space = CompositeSpace([
            dataset.data_specs[0].components[i]
            for i in xrange(len(dataset.data_specs[0].components) - 1)])
        self.output_space = dataset.data_specs[0].components[-1]
        self.input_source = dataset.data_specs[1][:-1]
        self.target_source = dataset.data_specs[1][-1]

    def __getstate__(self):
        d = {}
        d['vocab_size'] = self.vocab_size
        d['window_size'] = self.window_size
        d['feat_num'] = self.feat_num
        d['total_feats'] = self.total_feats
        d['n_classes'] = self.n_classes
        d['input_space'] = self.input_space
        d['output_space'] = self.output_space
        d['input_source'] = self.input_source
        d['target_source'] = self.target_source
        d['A'] = self.A
        d['tagger'] = self.tagger
        d['w2i'] = self.w2i
        d['t2i'] = self.t2i
        d['featurizer'] = self.featurizer
        d['max_epochs'] = self.max_epochs
        d['use_momentum'] = self.use_momentum
        d['lr'] = self.lr
        d['lr_lin_decay'] = self.lr_lin_decay
        d['lr_monitor_decay'] = self.lr_monitor_decay
        d['lr_scale'] = self.lr_scale
        d['valid_stop'] = self.valid_stop
        d['reg_factors'] = self.reg_factors
        d['dropout'] = self.dropout
        d['dropout_params'] = self.dropout_params
        d['monitor_train'] = self.monitor_train
        d['num'] = self.num
        d['plot_monitor'] = self.plot_monitor
        return d

    def fprop(self, data):
        tagger_out = self.tagger.fprop(data)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    def dropout_fprop(self, data, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.0,
                      input_scales=None, per_example=True):
        if input_scales is None:
            input_scales = {'input': 1.0}
        if input_include_probs is None:
            input_include_probs = {'input': 1.0}
        if self.dropout_params is not None:
            if len(self.dropout_params) == len(self.tagger.layers) - 1:
                input_include_probs['tagger_out'] = self.dropout_params[-1]
                input_scales['tagger_out'] = 1.0 / self.dropout_params[-1]
                for i, p in enumerate(self.dropout_params[:-1]):
                    input_include_probs['h{0}'.format(i)] = p
                    input_scales['h{0}'.format(i)] = 1.0 / p
        tagger_out = self.tagger.dropout_fprop(data,
                                               default_input_include_prob,
                                               input_include_probs,
                                               default_input_scale,
                                               input_scales,
                                               per_example)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    @functools.wraps(Model.get_lr_scalers)
    def get_lr_scalers(self):
        if not self.lr_scale:
            return {}
        d = self.tagger.get_lr_scalers()
        d[self.A] = 1. / self.n_classes
        return d

    @functools.wraps(Model.get_params)
    def get_params(self):
        return self.tagger.get_params() + [self.A]

    def create_adjustors(self):
        initial_momentum = .5
        final_momentum = .99
        start = 1
        saturate = self.max_epochs
        self.momentum_adjustor = learning_rule.MomentumAdjustor(
            final_momentum, start, saturate)
        self.momentum_rule = learning_rule.Momentum(initial_momentum,
                                                    nesterov_momentum=True)
        if self.lr_monitor_decay:
            self.learning_rate_adjustor = MonitorBasedLRAdjuster(
                high_trigger=1., shrink_amt=0.9,
                low_trigger=.95, grow_amt=1.1,
                channel_name='train_objective')
        elif self.lr_lin_decay:
            self.learning_rate_adjustor = LinearDecayOverEpoch(
                start, saturate, self.lr_lin_decay)

    def compute_used_inputs(self):
        seen = {'words': set(), 'feats': set()}
        for sen_w in self.dataset['train'].X1:
            seen['words'] |= reduce(lambda x, y: set(x) | set(y), sen_w, set())
        for sen_f in self.dataset['train'].X2:
            seen['feats'] |= reduce(lambda x, y: set(x) | set(y), sen_f, set())
        words = set(xrange(len(self.w2i)))
        feats = set(xrange(self.total_feats))
        self.notseen = {
            'words': numpy.array(sorted(words - seen['words'])),
            'feats': numpy.array(sorted(feats - seen['feats']))
        }

    def set_dataset(self, data):
        self._create_data_specs(data['train'])
        self.dataset = data
        self.compute_used_inputs()
        self.tagger.notseen = self.notseen

    def create_algorithm(self, data, save_best_path=None):
        self.set_dataset(data)
        self.create_adjustors()
        term = EpochCounter(max_epochs=self.max_epochs)
        if self.valid_stop:
            cost_crit = MonitorBased(channel_name='valid_objective',
                                     prop_decrease=.0, N=3)
            term = And(criteria=[cost_crit, term])

        #(layers, A_weight_decay)
        coeffs = None
        if self.reg_factors:
            rf = self.reg_factors
            lhdims = len(self.tagger.hdims)
            l_inputlayer = len(self.tagger.layers[0].layers)
            coeffs = ([[rf] * l_inputlayer] + ([rf] * lhdims) + [rf], rf)
        cost = SeqTaggerCost(coeffs, self.dropout)
        self.cost = cost

        self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                         save_path=save_best_path)

        mon_dataset = dict(self.dataset)
        if not self.monitor_train:
            del mon_dataset['train']

        _learning_rule = (self.momentum_rule if self.use_momentum else None)
        self.algorithm = SGD(batch_size=1,
                             learning_rate=self.lr,
                             termination_criterion=term,
                             monitoring_dataset=mon_dataset,
                             cost=cost,
                             learning_rule=_learning_rule)
        self.algorithm.setup(self, self.dataset['train'])

        if self.plot_monitor:
            cn = ["valid_objective", "test_objective"]
            if self.monitor_train:
                cn.append("train_objective")
            plots = Plots(channel_names=cn, save_path=self.plot_monitor)
            self.pm = PlotManager([plots], freq=1)
            self.pm.setup(self, None, self.algorithm)

    def train(self):
        while True:
            if not self.algorithm.continue_learning(self):
                break
            self.algorithm.train(dataset=self.dataset['train'])
            self.monitor.report_epoch()
            self.monitor()
            self.mbsb.on_monitor(self, self.dataset['valid'], self.algorithm)
            if self.use_momentum:
                self.momentum_adjustor.on_monitor(self, self.dataset['valid'],
                                                  self.algorithm)
            if hasattr(self, 'learning_rate_adjustor'):
                self.learning_rate_adjustor.on_monitor(self,
                                                       self.dataset['valid'],
                                                       self.algorithm)
            if hasattr(self, 'pm'):
                self.pm.on_monitor(self, self.dataset['valid'], self.algorithm)

    def prepare_tagging(self):
        X = self.get_input_space().make_theano_batch(batch_size=1)
        Y = self.fprop(X)
        self.f = theano.function([X[0], X[1]], Y)
        self.start = self.A.get_value()[0]
        self.end = self.A.get_value()[1]
        self.A_value = self.A.get_value()[2:]

    def process_input(self, words, feats):
        return self.f(words, feats)

    def tag_sen(self, words, feats, debug=False, return_probs=False):
        if not hasattr(self, 'f'):
            self.prepare_tagging()
        y = self.process_input(words, feats)
        tagger_out = y[2 + self.n_classes:]
        res = viterbi(self.start, self.A_value, self.end, tagger_out,
                      self.n_classes, return_probs)
        if return_probs:
            return res / res.sum(axis=1)[:, numpy.newaxis]
            #return res.reshape((1, len(res)))
        if debug:
            return numpy.array([[e] for e in res[1]]), tagger_out
        return numpy.array([[e] for e in res[1]])

    def get_score(self, dataset, mode='pwp'):
        self.prepare_tagging()
        tagged = (self.tag_sen(w, f) for w, f in izip(dataset.X1, dataset.X2))
        gold = dataset.y
        good, bad = 0., 0.
        if mode == 'pwp':
            for t, g in izip(tagged, gold):
                g = g.argmax(axis=1)
                t = t.flatten()
                good += sum(t == g)
                bad += sum(t != g)
            return [good / (good + bad)]
        elif mode == 'f1':
            i2t = [t for t, i in sorted(self.t2i.items(), key=lambda x: x[1])]
            f1c = FScCounter(i2t, binary_input=False)
            gold = map(lambda x: x.argmax(axis=1), gold)
            tagged = map(lambda x: x.flatten(), tagged)
            return f1c.count_score(gold, tagged)

    def set_embedding_weights(self, embedding_init):
        # load embedding with gensim
        from gensim.models import Word2Vec
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
            edim = m.layer1_size
        except UnicodeDecodeError:
            try:
                m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
                edim = m.layer1_size
            except UnicodeDecodeError:
                # not in word2vec format
                m = Word2Vec.load(embedding_init)
                edim = m.layer1_size
        except ValueError:
            # glove model
            m = {}
            if embedding_init.endswith('gz'):
                fp = gzip.open(embedding_init)
            else:
                fp = open(embedding_init)
            for l in fp:
                le = l.split()
                m[le[0].decode('utf-8')] = numpy.array(
                    [float(e) for e in le[1:]], dtype=theano.config.floatX)
                edim = len(le) - 1
        if edim != self.edim:
            raise Exception("Embedding dim and edim doesn't match")
        m_lower = {}
        vocab = (m.vocab if hasattr(m, 'vocab') else m)
        for k in vocab:
            if k in ['UNKNOWN', 'PADDING']:
                continue
            if self.num:
                m_lower[replace_numerals(k.lower())] = m[k]
            else:
                m_lower[k.lower()] = m[k]
        # transform weight matrix with using self.w2i
        params = numpy.zeros(
            self.tagger.layers[0].layers[0].get_param_vector().shape,
            dtype=theano.config.floatX)
        e = self.edim
        for w in self.w2i:
            if w in m_lower:
                v = m_lower[w]
                i = self.w2i[w]
                params[i * e:(i + 1) * e] = v
        if 'UNKNOWN' in vocab:
            params[-1 * e:] = vocab['UNKNOWN']
        if 'PADDING' in vocab:
            params[-2 * e:-1 * e] = vocab['PADDING']
        self.tagger.layers[0].layers[0].set_param_vector(params)
def train(d=None):
    train_X = np.array(d.train_X)
    train_y = np.array(d.train_Y)
    valid_X = np.array(d.valid_X)
    valid_y = np.array(d.valid_Y)
    test_X = np.array(d.test_X)
    test_y = np.array(d.test_Y)
    nb_classes = len(np.unique(train_y))
    train_y = convert_one_hot(train_y)
    valid_y = convert_one_hot(valid_y)
    # train_set = RotationalDDM(X=train_X, y=train_y)
    train_set = DenseDesignMatrix(X=train_X, y=train_y)
    valid_set = DenseDesignMatrix(X=valid_X, y=valid_y)
    print 'Setting up'
    batch_size = 100
    c0 = mlp.ConvRectifiedLinear(
        layer_name='c0',
        output_channels=64,
        irange=.05,
        kernel_shape=[5, 5],
        pool_shape=[4, 4],
        pool_stride=[2, 2],
        # W_lr_scale=0.25,
        max_kernel_norm=1.9365)
    c1 = mlp.ConvRectifiedLinear(
        layer_name='c1',
        output_channels=64,
        irange=.05,
        kernel_shape=[5, 5],
        pool_shape=[4, 4],
        pool_stride=[2, 2],
        # W_lr_scale=0.25,
        max_kernel_norm=1.9365)
    c2 = mlp.ConvRectifiedLinear(
        layer_name='c2',
        output_channels=64,
        irange=.05,
        kernel_shape=[5, 5],
        pool_shape=[4, 4],
        pool_stride=[5, 4],
        W_lr_scale=0.25,
        # max_kernel_norm=1.9365
    )
    sp0 = mlp.SoftmaxPool(
        detector_layer_dim=16,
        layer_name='sp0',
        pool_size=4,
        sparse_init=512,
    )
    sp1 = mlp.SoftmaxPool(
        detector_layer_dim=16,
        layer_name='sp1',
        pool_size=4,
        sparse_init=512,
    )
    r0 = mlp.RectifiedLinear(
        layer_name='r0',
        dim=512,
        sparse_init=512,
    )
    r1 = mlp.RectifiedLinear(
        layer_name='r1',
        dim=512,
        sparse_init=512,
    )
    s0 = mlp.Sigmoid(
        layer_name='s0',
        dim=500,
        # max_col_norm=1.9365,
        sparse_init=15,
    )
    out = mlp.Softmax(
        n_classes=nb_classes,
        layer_name='output',
        irange=.0,
        # max_col_norm=1.9365,
        # sparse_init=nb_classes,
    )
    epochs = EpochCounter(100)
    layers = [s0, out]
    decay_coeffs = [.00005, .00005, .00005]
    in_space = Conv2DSpace(
        shape=[d.size, d.size],
        num_channels=1,
    )
    vec_space = VectorSpace(d.size ** 2)
    nn = mlp.MLP(
        layers=layers,
        # input_space=in_space,
        nvis=d.size ** 2,
        # batch_size=batch_size,
    )
    trainer = sgd.SGD(
        learning_rate=0.01,
        # cost=SumOfCosts(costs=[
        #     dropout.Dropout(),
        #     MethodCost(method='cost_from_X'),
        #     WeightDecay(decay_coeffs),
        # ]),
        # cost=MethodCost(method='cost_from_X'),
        batch_size=batch_size,
        # train_iteration_mode='even_shuffled_sequential',
        termination_criterion=epochs,
        # learning_rule=learning_rule.Momentum(init_momentum=0.5),
    )
    # NOTE: the SGD trainer above is immediately replaced by a BGD trainer.
    trainer = bgd.BGD(
        batch_size=10000,
        line_search_mode='exhaustive',
        conjugate=1,
        updates_per_batch=10,
        termination_criterion=epochs,
    )
    lr_adjustor = LinearDecayOverEpoch(
        start=1,
        saturate=10,
        decay_factor=.1,
    )
    momentum_adjustor = learning_rule.MomentumAdjustor(
        final_momentum=.99,
        start=1,
        saturate=10,
    )
    trainer.setup(nn, train_set)
    print 'Learning'
    test_X = vec_space.np_format_as(test_X, nn.get_input_space())
    train_X = vec_space.np_format_as(train_X, nn.get_input_space())
    i = 0
    X = nn.get_input_space().make_theano_batch()
    Y = nn.fprop(X)
    predict = theano.function([X], Y)
    best = -40
    best_iter = -1
    while trainer.continue_learning(nn):
        print '--------------'
        print 'Training Epoch ' + str(i)
        trainer.train(dataset=train_set)
        nn.monitor()
        print 'Evaluating...'
        predictions = convert_categorical(predict(train_X[:2000]))
        score = accuracy_score(convert_categorical(train_y[:2000]), predictions)
        print 'Score on train: ' + str(score)
        predictions = convert_categorical(predict(test_X))
        score = accuracy_score(test_y, predictions)
        print 'Score on test: ' + str(score)
        best, best_iter = (best, best_iter) if best > score else (score, i)
        print 'Current best: ' + str(best) + ' at iter ' + str(best_iter)
        print classification_report(test_y, predictions)
        print 'Adjusting parameters...'
        # momentum_adjustor.on_monitor(nn, valid_set, trainer)
        # lr_adjustor.on_monitor(nn, valid_set, trainer)
        i += 1
        print ' '