Code example #1
File: test_sgd.py Project: AlexArgus/pylearn2
        def run_algorithm():
            unsupported_modes = ['random_slice', 'random_uniform']
            algorithm = SGD(learning_rate,
                            cost,
                            batch_size=batch_size,
                            train_iteration_mode=mode,
                            monitoring_dataset=None,
                            termination_criterion=termination_criterion,
                            update_callbacks=None,
                            init_momentum=None,
                            set_batch_size=False)

            algorithm.setup(dataset=dataset, model=model)

            raised = False
            try:
                algorithm.train(dataset)
            except ValueError:
                print(mode)
                assert mode in unsupported_modes
                raised = True
            if mode in unsupported_modes:
                assert raised
                return True
            return False
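This helper is a fragment nested inside a larger test in test_sgd.py; `mode`, `learning_rate`, `cost`, `batch_size`, `dataset`, `model` and `termination_criterion` are bound in the enclosing function. A minimal sketch of how such a helper might be driven, assuming (this is not shown in the snippet) that the enclosing test loops over iteration modes and defines `run_algorithm` inside that loop:

    # Hypothetical enclosing loop (an assumption, not part of the snippet):
    # only the unsupported modes should make run_algorithm() observe a ValueError.
    for mode in ['sequential', 'shuffled_sequential',
                 'random_slice', 'random_uniform']:
        run_algorithm()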
Code example #2
File: test_sgd.py Project: AlexArgus/pylearn2
def test_sgd_unspec_num_mon_batch():

    # tests that if you don't specify a number of
    # monitoring batches, SGD configures the monitor
    # to run on all the data

    m = 25

    visited = [False] * m
    rng = np.random.RandomState([25, 9, 2012])
    X = np.zeros((m, 1))
    X[:, 0] = np.arange(m)
    dataset = DenseDesignMatrix(X=X)

    model = SoftmaxModel(1)

    learning_rate = 1e-3
    batch_size = 5

    cost = DummyCost()

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    monitoring_batches=None,
                    monitoring_dataset=dataset,
                    termination_criterion=None,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    algorithm.setup(dataset=dataset, model=model)

    monitor = Monitor.get_monitor(model)

    X = T.matrix()

    def tracker(*data):
        X, = data
        assert X.shape[1] == 1
        for i in xrange(X.shape[0]):
            visited[int(X[i, 0])] = True

    monitor.add_channel(name='tracker',
                        ipt=X,
                        val=0.,
                        prereqs=[tracker],
                        data_specs=(model.get_input_space(),
                                    model.get_input_source()))

    monitor()

    if False in visited:
        print(visited)
        assert False
Code example #3
def test_sgd_unspec_num_mon_batch():

    # tests that if you don't specify a number of
    # monitoring batches, SGD configures the monitor
    # to run on all the data

    m = 25

    visited = [False] * m
    rng = np.random.RandomState([25, 9, 2012])
    X = np.zeros((m, 1))
    X[:, 0] = np.arange(m)
    dataset = DenseDesignMatrix(X=X)

    model = SoftmaxModel(1)

    learning_rate = 1e-3
    batch_size = 5

    cost = DummyCost()

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=5,
                    monitoring_batches=None,
                    monitoring_dataset=dataset,
                    termination_criterion=None,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    algorithm.setup(dataset=dataset, model=model)

    monitor = Monitor.get_monitor(model)

    X = T.matrix()

    def tracker(*data):
        X, = data
        assert X.shape[1] == 1
        for i in xrange(X.shape[0]):
            visited[int(X[i, 0])] = True

    monitor.add_channel(name='tracker',
                        ipt=X,
                        val=0.,
                        prereqs=[tracker],
                        data_specs=(model.get_input_space(),
                                    model.get_input_source()))

    monitor()

    if False in visited:
        print(visited)
        assert False
Code example #4
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values as
    with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        inc = []
        rval = []
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = - scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
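A note on the hand-crafted update above: SumOfOneHalfParamsSquared is the sum of 0.5 * p**2 over all parameters, so the gradient with respect to each parameter is the parameter value itself, which is why `param_val` stands in for the gradient. As a hedged, standalone sketch of the RMSProp rule being checked (NumPy only, illustrative names, not pylearn2 API):

    import numpy as np

    def rmsprop_step(param, grad, g2, lr, decay=0.9, max_scaling=1e5):
        # Keep a decaying average of squared gradients and scale the step by its RMS.
        g2 = decay * g2 + (1. - decay) * grad ** 2
        rms = np.maximum(np.sqrt(g2), 1. / max_scaling)  # floor to avoid division by ~0
        return param - lr * grad / rms, g2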
Code example #5
File: test_sgd.py Project: AlexArgus/pylearn2
def test_sgd_sequential():

    # tests that requesting train_iteration_mode = 'sequential'
    # works

    dim = 1
    batch_size = 3
    m = 5 * batch_size

    dataset = ArangeDataset(m)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    visited = [False] * m

    def visit(X):
        assert X.shape[1] == 1
        assert np.all(X[1:] == X[0:-1]+1)
        start = int(X[0, 0])
        if start > 0:
            assert visited[start - 1]
        for i in xrange(batch_size):
            assert not visited[start+i]
            visited[start+i] = 1

    data_specs = (model.get_input_space(), model.get_input_source())
    cost = CallbackCost(visit, data_specs)

    # We need to include this so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    train_iteration_mode='sequential',
                    monitoring_dataset=None,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    algorithm.setup(dataset=dataset, model=model)

    algorithm.train(dataset)

    assert all(visited)
Code example #6
File: test_sgd.py Project: rudaoshi/pylearn2
def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    manual = [
        param - learning_rate * scale for param, scale in zip(manual, scales)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))

    manual = [
        param - learning_rate * scale for param, scale in zip(manual, scales)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))
Code example #7
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        inc = []
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t**2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Code example #8
def test_train_ae():
    GC = GaussianCorruptor

    gsn = GSN.new(layer_sizes=[ds.X.shape[1], 1000],
                  activation_funcs=["sigmoid", "tanh"],
                  pre_corruptors=[None, GC(1.0)],
                  post_corruptors=[SaltPepperCorruptor(0.5),
                                   GC(1.0)],
                  layer_samplers=[BinomialSampler(), None],
                  tied=False)

    # average MBCE over example rather than sum it
    _mbce = MeanBinaryCrossEntropy()
    reconstruction_cost = lambda a, b: _mbce.cost(a, b) / ds.X.shape[1]

    c = GSNCost([(0, 1.0, reconstruction_cost)], walkback=WALKBACK)

    alg = SGD(LEARNING_RATE,
              init_momentum=MOMENTUM,
              cost=c,
              termination_criterion=EpochCounter(MAX_EPOCHS),
              batches_per_iter=BATCHES_PER_EPOCH,
              batch_size=BATCH_SIZE,
              monitoring_dataset=ds,
              monitoring_batches=10)

    trainer = Train(ds,
                    gsn,
                    algorithm=alg,
                    save_path="gsn_ae_example.pkl",
                    save_freq=5)
    trainer.main_loop()
    print "done training"
Code example #9
def get_ae_pretrainer(layer, data, batch_size, epochs=30):
    init_lr = 0.05

    train_algo = SGD(
        batch_size=batch_size,
        learning_rate=init_lr,
        learning_rule=Momentum(init_momentum=0.5),
        monitoring_batches=batch_size,
        monitoring_dataset=data,
        # for ContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')]]),
        # for HigherOrderContractiveAutoencoder:
        # cost=cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
        #                             [0.5, cost.MethodCost(method='contraction_penalty')],
        #                             [0.5, cost.MethodCost(method='higher_order_penalty')]]),
        # for DenoisingAutoencoder:
        cost=MeanSquaredReconstructionError(),
        termination_criterion=EpochCounter(epochs))
    return Train(model=layer,
                 algorithm=train_algo,
                 dataset=data,
                 extensions=[
                     MomentumAdjustor(final_momentum=0.9, start=0,
                                      saturate=25),
                     LinearDecayOverEpoch(start=1,
                                          saturate=25,
                                          decay_factor=.02)
                 ])
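A hedged usage sketch for this factory (`autoencoder` and `data` are placeholders for any pylearn2 autoencoder model and a dataset with matching dimensions; they are not defined in the original snippet):

    pretrainer = get_ae_pretrainer(autoencoder, data, batch_size=100, epochs=30)
    pretrainer.main_loop()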
Code example #10
File: test_train.py Project: yo-ga/TextDetector
def test_execution_order():

    # ensure save is called directly after monitoring by checking
    # parameter values in `on_monitor` and `on_save`.

    model = MLP(layers=[Softmax(layer_name='y', n_classes=2, irange=0.)],
                nvis=3)

    dataset = DenseDesignMatrix(X=np.random.normal(size=(6, 3)),
                                y=np.random.normal(size=(6, 2)))

    epoch_counter = EpochCounter(max_epochs=1)

    algorithm = SGD(batch_size=2,
                    learning_rate=0.1,
                    termination_criterion=epoch_counter)

    extension = ParamMonitor()

    train = Train(dataset=dataset,
                  model=model,
                  algorithm=algorithm,
                  extensions=[extension],
                  save_freq=1,
                  save_path="save.pkl")

    # mock save
    train.save = MethodType(only_run_extensions, train)

    train.main_loop()
Code example #11
    def get_train_sgd(self, config_id):
        row = self.db.executeSQL(
            """
        SELECT  learning_rate,batch_size,init_momentum,
                train_iteration_mode,cost_array,term_array
        FROM hps3.train_sgd
        WHERE config_id = %s
        """, (config_id, ), self.db.FETCH_ONE)
        if not row:
            raise HPSData("No stochasticGradientDescent for config_id="\
                +str(config_id))
        (learning_rate, batch_size, init_momentum, train_iteration_mode,
         cost_array, term_array) = row
        # cost
        cost = self.get_costs(cost_array)

        num_train_batch = (self.ntrain / self.batch_size)
        print "num training batches:", num_train_batch
        termination_criterion \
            = self.get_terminations(config_id, term_array)
        return SGD(learning_rate=learning_rate,
                   cost=cost,
                   batch_size=batch_size,
                   batches_per_iter=num_train_batch,
                   monitoring_dataset=self.monitoring_dataset,
                   termination_criterion=termination_criterion,
                   init_momentum=init_momentum,
                   train_iteration_mode=train_iteration_mode)
Code example #12
File: training.py Project: albertomontesg/deeplearn
def train_model():
    global ninput, noutput
    simdata = SimulationData(
        sim_path="../../javaDataCenter/generarDadesV1/CA_SDN_topo1/")
    simdata.load_data()
    simdata.preprocessor()
    dataset = simdata.get_matrix()

    structure = get_structure()
    layers = []
    for pair in structure:
        layers.append(get_autoencoder(pair))

    model = DeepComposedAutoencoder(layers)
    training_alg = SGD(learning_rate=1e-3,
                       cost=MeanSquaredReconstructionError(),
                       batch_size=1296,
                       monitoring_dataset=dataset,
                       termination_criterion=EpochCounter(max_epochs=50))
    extensions = [MonitorBasedLRAdjuster()]
    experiment = Train(dataset=dataset,
                       model=model,
                       algorithm=training_alg,
                       save_path='training2.pkl',
                       save_freq=10,
                       allow_overwrite=True,
                       extensions=extensions)
    experiment.main_loop()
Code example #13
File: model_convnet.py Project: gaoch023/kaggle
def get_trainer(model, trainset, validset, epochs=20, batch_size=200):
    monitoring_batches = None if validset is None else 20
    train_algo = SGD(batch_size=batch_size,
                     init_momentum=0.5,
                     learning_rate=0.1,
                     monitoring_batches=monitoring_batches,
                     monitoring_dataset=validset,
                     cost=Dropout(input_include_probs={
                         'h0': 0.8,
                         'h1': 0.8,
                         'h2': 0.8,
                         'h3': 0.8,
                         'y': 0.5
                     },
                                  input_scales={
                                      'h0': 1. / 0.8,
                                      'h1': 1. / 0.8,
                                      'h2': 1. / 0.8,
                                      'h3': 1. / 0.8,
                                      'y': 1. / 0.5
                                  },
                                  default_input_include_prob=0.5,
                                  default_input_scale=1. / 0.5),
                     termination_criterion=EpochCounter(epochs),
                     update_callbacks=ExponentialDecay(decay_factor=1.0001,
                                                       min_lr=0.001))
    return Train(model=model,
                 algorithm=train_algo,
                 dataset=trainset,
                 save_freq=0,
                 save_path='epoch',
                 extensions=[MomentumAdjustor(final_momentum=0.9,
                                              start=0,
                                              saturate=int(epochs * 0.8))])
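A hedged usage sketch (the `model`, `trainset` and `validset` names are placeholders; passing validset=None disables monitoring, as the first line of the function shows):

    trainer = get_trainer(model, trainset, validset, epochs=20, batch_size=200)
    trainer.main_loop()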
Code example #14
    def get_train_sgd(self):

        cost = MethodCost('cost_from_X')
        #cost = self.get_costs()
        num_train_batch = (self.ntrain/self.batch_size)
        print "num training batches:", num_train_batch

        termination_criterion = self.get_terminations()

        monitoring_dataset = {}
        for dataset_id in self.state.monitoring_dataset:
            if dataset_id == 'test' and self.test_ddm is not None:
                monitoring_dataset['test'] = self.test_ddm
            elif dataset_id == 'valid' and self.valid_ddm is not None:
                monitoring_dataset['valid'] = self.valid_ddm
            else:
                monitoring_dataset = None
            
        return SGD( learning_rate=self.state.learning_rate,
                    batch_size=self.state.batch_size,
                    cost=cost,
                    batches_per_iter=num_train_batch,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    init_momentum=self.state.init_momentum,
                    train_iteration_mode=self.state.train_iteration_mode)
Code example #15
    def __init__(self,
                 runner,
                 model_params,
                 resume=False,
                 resume_data=None,
                 s3_data=None,
                 **kwargs):
        dataset = create_dense_design_matrix(x=runner.dp.train_set_x)

        if resume:
            model, model_params = self.resume_model(model_params, resume_data)
        else:
            model = self.new_model(model_params, dataset=dataset)

        termination_criterion = MaxEpochNumber(model_params['maxnum_iter'])
        algorithm = SGD(learning_rate=model_params['learning_rate']['init'],
                        monitoring_dataset=dataset,
                        cost=MeanSquaredReconstructionError(),
                        termination_criterion=termination_criterion,
                        batch_size=model_params['batch_size'])
        ext = AutoEncoderStatReporter(runner,
                                      resume=resume,
                                      resume_data=resume_data,
                                      save_freq=model_params['save_freq'])
        self.train_obj = Train(dataset=dataset,
                               model=model,
                               algorithm=algorithm,
                               extensions=[ext])
Code example #16
def test_multiple_inputs():
    """
    Create a VectorSpacesDataset with two inputs (features0 and features1)
    and train an MLP which takes both inputs for 1 epoch.
    """
    mlp = MLP(layers=[
        FlattenerLayer(
            CompositeLayer('composite',
                           [Linear(10, 'h0', 0.1),
                            Linear(10, 'h1', 0.1)], {
                                0: [1],
                                1: [0]
                            })),
        Softmax(5, 'softmax', 0.1)
    ],
              input_space=CompositeSpace([VectorSpace(15),
                                          VectorSpace(20)]),
              input_source=('features0', 'features1'))
    dataset = VectorSpacesDataset(
        (np.random.rand(20, 20).astype(theano.config.floatX),
         np.random.rand(20, 15).astype(theano.config.floatX),
         np.random.rand(20, 5).astype(theano.config.floatX)),
        (CompositeSpace(
            [VectorSpace(20), VectorSpace(15),
             VectorSpace(5)]), ('features1', 'features0', 'targets')))
    train = Train(dataset, mlp, SGD(0.1, batch_size=5))
    train.algorithm.termination_criterion = EpochCounter(1)
    train.main_loop()
Code example #17
def train_example(dataset=None):
    model = GaussianBinaryRBM(nvis=1296,
                              nhid=61,
                              irange=0.5,
                              energy_function_class=grbm_type_1(),
                              learn_sigma=True,
                              init_sigma=.4,
                              init_bias_hid=2.,
                              mean_vis=False,
                              sigma_lr_scale=1e-3)
    cost = SMD(corruptor=GaussianCorruptor(stdev=0.4))
    algorithm = SGD(learning_rate=.1,
                    batch_size=5,
                    monitoring_batches=20,
                    monitoring_dataset=dataset,
                    cost=cost,
                    termination_criterion=MonitorBased(prop_decrease=0.01,
                                                       N=1))
    train = Train(dataset=dataset,
                  model=model,
                  save_path="./experiment/training.pkl",
                  save_freq=10,
                  algorithm=algorithm,
                  extensions=[])
    train.main_loop()
Code example #18
def test_sgd_sup():

    # tests that we can run the sgd algorithm
    # on a supervised cost.
    # does not test for correctness at all, just
    # that the algorithm runs without dying

    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m, ))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    dataset = DenseDesignMatrix(X=X, y=Y)

    m = 15
    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m,))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    # Including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_dataset = DenseDesignMatrix(X=X, y=Y)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    cost = SupervisedDummyCost()

    # We need to include this so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=3,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=None)

    train.main_loop()
Code example #19
    def train_with_monitoring_datasets(train_dataset,
                                       monitoring_datasets,
                                       model_force_batch_size,
                                       train_iteration_mode,
                                       monitor_iteration_mode):

        model = SoftmaxModel(dim)
        if model_force_batch_size:
            model.force_batch_size = model_force_batch_size

        cost = DummyCost()

        algorithm = SGD(learning_rate, cost,
                        batch_size=batch_size,
                        train_iteration_mode=train_iteration_mode,
                        monitor_iteration_mode=monitor_iteration_mode,
                        monitoring_dataset=monitoring_datasets,
                        termination_criterion=EpochCounter(2))

        train = Train(train_dataset,
                      model,
                      algorithm,
                      save_path=None,
                      save_freq=0,
                      extensions=None)

        train.main_loop()
Code example #20
def get_layer_trainer_sgd_autoencoder(layer,
                                      trainset,
                                      batch_size=10,
                                      learning_rate=0.1,
                                      max_epochs=100,
                                      name=''):
    # configs on sgd
    train_algo = SGD(
        learning_rate=learning_rate,
        #             learning_rule = AdaDelta(),
        learning_rule=Momentum(init_momentum=0.5),
        cost=MeanSquaredReconstructionError(),
        batch_size=batch_size,
        monitoring_dataset=trainset,
        termination_criterion=EpochCounter(max_epochs=max_epochs),
        update_callbacks=None)

    log_callback = LoggingCallback(name)

    return Train(model=layer,
                 algorithm=train_algo,
                 extensions=[
                     log_callback,
                     OneOverEpoch(start=1, half_life=5),
                     MomentumAdjustor(final_momentum=0.7,
                                      start=10,
                                      saturate=100)
                 ],
                 dataset=trainset)
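A hedged usage sketch (the `layer` and `trainset` objects are placeholders for an autoencoder model and its training set; they are not part of the snippet):

    layer_trainer = get_layer_trainer_sgd_autoencoder(layer, trainset,
                                                      batch_size=10,
                                                      learning_rate=0.1,
                                                      max_epochs=100,
                                                      name='ae0')
    layer_trainer.main_loop()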
Code example #21
File: test_pylearn2.py Project: jych/blocks
def test_pylearn2_trainin():
    # Construct the model
    mlp = MLP(activations=[Sigmoid(), Sigmoid()],
              dims=[784, 100, 784],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0.01))
    mlp.initialize()
    cost = SquaredError()

    block_cost = BlocksCost(cost)
    block_model = BlocksModel(mlp, (VectorSpace(dim=784), 'features'))

    # Load the data
    rng = numpy.random.RandomState(14)
    train_dataset = random_dense_design_matrix(rng, 1024, 784, 10)
    valid_dataset = random_dense_design_matrix(rng, 1024, 784, 10)

    # Silence Pylearn2's logger
    logger = logging.getLogger(pylearn2.__name__)
    logger.setLevel(logging.ERROR)

    # Training algorithm
    sgd = SGD(learning_rate=0.01,
              cost=block_cost,
              batch_size=128,
              monitoring_dataset=valid_dataset)
    train = Train(train_dataset, block_model, algorithm=sgd)
    train.main_loop(time_budget=3)
Code example #22
File: main.py Project: hycis/conditional_computation
def model1():
    #pdb.set_trace()
    # train set X has dim (60,000, 784), y has dim (60,000, 10)
    train_set = MNIST(which_set='train', one_hot=True)
    # test set X has dim (10,000, 784), y has dim (10,000, 10)
    valid_set = MNIST(which_set='test', one_hot=True)
    test_set = MNIST(which_set='test', one_hot=True)

    #import pdb
    #pdb.set_trace()
    #print train_set.X.shape[1]

    # =====<Create the MLP Model>=====

    h2_layer = NoisyRELU(layer_name='h1',
                         sparse_init=15,
                         noise_factor=5,
                         dim=1000,
                         desired_active_rate=0.2,
                         bias_factor=20,
                         max_col_norm=1)
    #h2_layer = RectifiedLinear(layer_name='h2', dim=100, sparse_init=15, max_col_norm=1)
    #print h1_layer.get_params()
    #h2 = RectifiedLinear(layer_name='h2', dim=500, sparse_init=15, max_col_norm=1)
    y_layer = Softmax(layer_name='y', n_classes=10, irange=0., max_col_norm=1)

    mlp = MLP(batch_size=200,
              input_space=VectorSpace(dim=train_set.X.shape[1]),
              layers=[h2_layer, y_layer])

    # =====<Create the SGD algorithm>=====
    sgd = SGD(init_momentum=0.1,
              learning_rate=0.01,
              monitoring_dataset={'valid': valid_set},
              cost=MethodCost('cost_from_X'),
              termination_criterion=MonitorBased(
                  channel_name='valid_y_misclass', prop_decrease=0.001, N=50))
    #sgd.setup(model=mlp, dataset=train_set)

    # =====<Extensions>=====
    ext = [MomentumAdjustor(start=1, saturate=10, final_momentum=0.9)]

    # =====<Create Training Object>=====
    save_path = './mlp_model1.pkl'
    train_obj = Train(dataset=train_set,
                      model=mlp,
                      algorithm=sgd,
                      extensions=ext,
                      save_path=save_path,
                      save_freq=0)
    #train_obj.setup_extensions()

    #import pdb
    #pdb.set_trace()
    train_obj.main_loop()

    # =====<Run the training>=====
Code example #23
    def create_algorithm(self, data, save_best_path=None):
        self.set_dataset(data)
        self.create_adjustors()
        term = EpochCounter(max_epochs=self.max_epochs)
        if self.valid_stop:
            cost_crit = MonitorBased(channel_name='valid_objective',
                                     prop_decrease=.0,
                                     N=3)
            term = And(criteria=[cost_crit, term])

        #(layers, A_weight_decay)
        coeffs = None
        if self.reg_factors:
            rf = self.reg_factors
            lhdims = len(self.tagger.hdims)
            l_inputlayer = len(self.tagger.layers[0].layers)
            coeffs = ([[rf] * l_inputlayer] + ([rf] * lhdims) + [rf], rf)
        cost = SeqTaggerCost(coeffs, self.dropout)
        self.cost = cost

        self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                         save_path=save_best_path)
        mon_dataset = dict(self.dataset)
        if not self.monitor_train:
            del mon_dataset['train']

        _learning_rule = (self.momentum_rule if self.use_momentum else None)
        self.algorithm = SGD(
            batch_size=1,
            learning_rate=self.lr,
            termination_criterion=term,
            monitoring_dataset=mon_dataset,
            cost=cost,
            learning_rule=_learning_rule,
        )

        self.algorithm.setup(self, self.dataset['train'])
        if self.plot_monitor:
            cn = ["valid_objective", "test_objective"]
            if self.monitor_train:
                cn.append("train_objective")
            plots = Plots(channel_names=cn, save_path=self.plot_monitor)
            self.pm = PlotManager([plots], freq=1)
            self.pm.setup(self, None, self.algorithm)
Code example #24
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val**2
            dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2']) *
                     param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Code example #25
File: test_sgd.py Project: capybaralet/current
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """

    cost = SumOfParams()

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))

    manual = [
        param - learning_rate * scale + i * momentum
        for param, scale, i in zip(manual, scales, inc)
    ]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in zip(manual, model.get_params()))
Code example #26
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        inc = []
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Code example #27
File: nnlm.py Project: katalin-pajkossy/hunvec
    def create_algorithm(self):
        cost_crit = MonitorBased(channel_name=self.optimize_for,
                                 prop_decrease=0.,
                                 N=10)
        epoch_cnt_crit = EpochCounter(max_epochs=self.max_epochs)
        term = And(criteria=[cost_crit, epoch_cnt_crit])
        self.algorithm = SGD(batch_size=100,
                             learning_rate=.01,
                             monitoring_dataset=self.alg_datasets,
                             termination_criterion=term)
Code example #28
def test_train_supervised():
    """
    Train a supervised GSN.
    """
    # initialize the GSN
    gsn = GSN.new(
        layer_sizes=[ds.X.shape[1], 1000, ds.y.shape[1]],
        activation_funcs=["sigmoid", "tanh", rescaled_softmax],
        pre_corruptors=[GaussianCorruptor(0.5)] * 3,
        post_corruptors=[
            SaltPepperCorruptor(.3), None,
            SmoothOneHotCorruptor(.5)
        ],
        layer_samplers=[BinomialSampler(), None,
                        MultinomialSampler()],
        tied=False)

    # average over costs rather than summing
    _rcost = MeanBinaryCrossEntropy()
    reconstruction_cost = lambda a, b: _rcost.cost(a, b) / ds.X.shape[1]

    _ccost = MeanBinaryCrossEntropy()
    classification_cost = lambda a, b: _ccost.cost(a, b) / ds.y.shape[1]

    # combine costs into GSNCost object
    c = GSNCost(
        [
            # reconstruction on layer 0 with weight 1.0
            (0, 1.0, reconstruction_cost),

            # classification on layer 2 with weight 2.0
            (2, 2.0, classification_cost)
        ],
        walkback=WALKBACK,
        mode="supervised")

    alg = SGD(
        LEARNING_RATE,
        init_momentum=MOMENTUM,
        cost=c,
        termination_criterion=EpochCounter(MAX_EPOCHS),
        batches_per_iter=BATCHES_PER_EPOCH,
        batch_size=BATCH_SIZE,
        monitoring_dataset=ds,
        monitoring_batches=10,
    )

    trainer = Train(ds,
                    gsn,
                    algorithm=alg,
                    save_path="gsn_sup_example.pkl",
                    save_freq=10,
                    extensions=[MonitorBasedLRAdjuster()])
    trainer.main_loop()
    print("done training")
Code example #29
def testing_multiple_datasets_with_specified_dataset_in_monitor_based_lr():
    # tests that the class MonitorBasedLRAdjuster in sgd.py can properly use
    # the specified dataset_name in the constructor when multiple datasets
    # exist.

    dim = 3
    m = 10

    rng = np.random.RandomState([6, 2, 2014])

    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at some point
    epoch_num = 1

    # including a monitoring datasets lets us test that
    # the monitor works with supervised data
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    monitoring_dataset = {'train': monitoring_train, 'test': monitoring_test}

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    dataset_name = list(monitoring_dataset.keys())[0]
    monitor_lr = MonitorBasedLRAdjuster(dataset_name=dataset_name)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=[monitor_lr])

    train.main_loop()
Code example #30
File: test_sgd.py Project: allansp84/pylearn2
def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    manual = [param - learning_rate * scale for param, scale in
              zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale
              for param, scale
              in zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
Code example #31
    def set_training_criteria(self,
                              learning_rate=0.05,
                              cost=Default(),
                              batch_size=10,
                              max_epochs=10):

        self.training_alg = SGD(learning_rate=learning_rate,
                                cost=cost,
                                batch_size=batch_size,
                                monitoring_dataset=self.datasets,
                                termination_criterion=EpochCounter(max_epochs))
Code example #32
    def set_training_criteria(self,
                              learning_rate=0.05,
                              cost=MeanSquaredReconstructionError(),
                              batch_size=10,
                              max_epochs=10):

        self.training_alg = SGD(learning_rate=learning_rate,
                                cost=cost,
                                batch_size=batch_size,
                                monitoring_dataset=self.datasets,
                                termination_criterion=EpochCounter(max_epochs))
Code example #33
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    vel = [
        -learning_rate * scale + i * momentum
        for scale, i in izip(scales, vel)
    ]
    updates = [
        -learning_rate * scale + v * momentum
        for scale, v in izip(scales, vel)
    ]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Code example #34
def prepare_adagrad_test(dataset_type='arange', model_type='random'):
    """
    Factor out common code for AdaGrad tests.
    Parameters
    ----------
    dataset_type : string, optional
        Can use either `arange` to use an ArangeDataset instance or
        `zeros` to create an all-zeros DenseDesignMatrix.
    model_type : string, optional
        How to initialize the model; `random` will initialize parameters
        to random values, `zeros` to zero.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales, init_type=model_type)
    if dataset_type == 'arange':
        dataset = ArangeDataset(1)
    elif dataset_type == 'zeros':
        X = np.zeros((1, 1))
        X[:, 0] = np.arange(1)
        dataset = DenseDesignMatrix(X)
    else:
        raise ValueError('Unknown value for dataset_type: %s' %
                         dataset_type)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    return (cost, model, dataset, sgd, state)
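A hedged sketch of how a test might consume the returned tuple, following the same pattern as the test_adagrad example earlier (`adagrad_manual` is assumed to be a helper like the one defined inside that test; it is not provided by this factory):

    cost, model, dataset, sgd, state = prepare_adagrad_test()
    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(m, p.get_value())
               for m, p in izip(manual, model.get_params()))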
Code example #35
    def __init__(self,
                 runner,
                 model_params,
                 resume=False,
                 resume_data=None,
                 s3_data=None,
                 **kwargs):
        self.model_params = model_params
        self.out_nonlin = runner.model['out_nonlin']
        if self.out_nonlin == 'LINEARGAUSSIAN':
            outputs_num = None
            cost = None
        else:
            outputs_num = runner.dp.uniq_outputs_num
            cost = self.get_cost_fn()
        dataset = self.construct_datasets(runner.dp.train_set_x,
                                          runner.dp.train_set_y, outputs_num)
        valid_dataset = self.construct_datasets(runner.dp.test_set_x,
                                                runner.dp.test_set_y,
                                                outputs_num)
        if resume:
            model = self.resume_model(model_params, resume_data)
            lr_init = model_params['learning_rate']['init'] / (
                model_params['learning_rate']['decay_factor']**
                model.monitor.get_batches_seen())
        else:
            model = self.new_model(model_params, dataset=dataset)
            lr_init = model_params['learning_rate']['init']

        batches_per_iter = get_batches_per_iter(model_params, dataset)
        termination_criterion = MaxEpochNumber(model_params['maxnum_iter'])
        update_callbacks, extensions = construct_update(
            model_params, resume, resume_data)
        algorithm = SGD(learning_rate=lr_init,
                        init_momentum=model_params['momentum']['init'],
                        monitoring_dataset={
                            'valid': valid_dataset,
                            'train': dataset
                        },
                        cost=cost,
                        termination_criterion=termination_criterion,
                        update_callbacks=update_callbacks,
                        batches_per_iter=batches_per_iter)
        self.train_obj = Train(dataset=dataset,
                               model=model,
                               algorithm=algorithm,
                               extensions=extensions)
        ext = MLPStatReporter(model,
                              runner,
                              resume=resume,
                              resume_data=resume_data,
                              save_freq=model_params['save_freq'])
        self.train_obj.extensions.append(ext)
Code example #36
File: main.py Project: hycis/conditional_computation
def model2():
    #pdb.set_trace()
    # train set X has dim (60,000, 784), y has dim (60,000, 10)
    train_set = MNIST(which_set='train', one_hot=True)
    # test set X has dim (10,000, 784), y has dim (10,000, 10)
    test_set = MNIST(which_set='test', one_hot=True)

    # =====<Create the MLP Model>=====

    h1_layer = RectifiedLinear(layer_name='h1', dim=1000, irange=0.5)
    #print h1_layer.get_params()
    h2_layer = RectifiedLinear(layer_name='h2',
                               dim=1000,
                               sparse_init=15,
                               max_col_norm=1)
    y_layer = Softmax(layer_name='y',
                      n_classes=train_set.y.shape[1],
                      irange=0.5)

    mlp = MLP(batch_size=100,
              input_space=VectorSpace(dim=train_set.X.shape[1]),
              layers=[h1_layer, h2_layer, y_layer])

    # =====<Create the SGD algorithm>=====
    sgd = SGD(batch_size=100,
              init_momentum=0.1,
              learning_rate=0.01,
              monitoring_dataset={
                  'valid': train_set,
                  'test': test_set
              },
              cost=SumOfCosts(costs=[
                  MethodCost('cost_from_X'),
                  WeightDecay(coeffs=[0.00005, 0.00005, 0.00005])
              ]),
              termination_criterion=MonitorBased(
                  channel_name='valid_y_misclass', prop_decrease=0.0001, N=5))
    #sgd.setup(model=mlp, dataset=train_set)

    # =====<Extensions>=====
    ext = [MomentumAdjustor(start=1, saturate=10, final_momentum=0.99)]

    # =====<Create Training Object>=====
    save_path = './mlp_model2.pkl'
    train_obj = Train(dataset=train_set,
                      model=mlp,
                      algorithm=sgd,
                      extensions=ext,
                      save_path=save_path,
                      save_freq=0)
    #train_obj.setup_extensions()

    train_obj.main_loop()
Code example #37
        def run_algorithm():
            unsupported_modes = ['random_slice', 'random_uniform']
            algorithm = SGD(learning_rate,
                            cost,
                            batch_size=5,
                            train_iteration_mode=mode,
                            monitoring_dataset=None,
                            termination_criterion=termination_criterion,
                            update_callbacks=None,
                            init_momentum=None,
                            set_batch_size=False)

            algorithm.setup(dataset=dataset, model=model)

            raised = False
            try:
                algorithm.train(dataset)
            except ValueError:
                print(mode)
                assert mode in unsupported_modes
                raised = True
            if mode in unsupported_modes:
                assert raised
                return True
            return False
Code example #38
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val ** 2
            dx_t = - (scale * learning_rate
                      / np.sqrt(pstate['sg2'])
                      * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Code example #39
File: test_sgd.py Project: alito/pylearn2
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """

    cost = SumOfParams()

    scales = [ .01, .02, .05, 1., 5. ]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
Code example #40
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)
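    # Hand-derived Nesterov momentum check (the gradient of SumOfParams is 1
    # for every element): vel_t = momentum * vel_{t-1} - learning_rate * scale,
    # and each parameter moves by momentum * vel_t - learning_rate * scale.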

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
Code example #41
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values as
    with a hand-crafted sgd w/ momentum implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)
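    # Hand-derived check of standard momentum (gradient is 1 for every
    # element): inc_t = momentum * inc_{t-1} - learning_rate * scale, and each
    # parameter moves by inc_t on every step.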

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in izip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in izip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param in
               izip(manual, model.get_params()))
Code example #42
File: test_sgd.py Project: AlexArgus/pylearn2
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)
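    # Same hand-computed expectation as above: the DummyCost term has
    # coefficient 0, so the gradient is 1 per element and each step is
    # -learning_rate * scale plus the momentum-carried previous increment.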

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in
              zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
Code example #43
File: sequence_tagger.py Project: zseder/hunvec
    def create_algorithm(self, data, save_best_path=None):
        self.set_dataset(data)
        self.create_adjustors()
        term = EpochCounter(max_epochs=self.max_epochs)
        if self.valid_stop:
            cost_crit = MonitorBased(channel_name='valid_objective',
                                     prop_decrease=.0, N=3)
            term = And(criteria=[cost_crit, term])

        # coeffs structure: (per-layer weight-decay coefficients,
        #                    weight decay on the A transition matrix)
        coeffs = None
        if self.reg_factors:
            rf = self.reg_factors
            lhdims = len(self.tagger.hdims)
            l_inputlayer = len(self.tagger.layers[0].layers)
            coeffs = ([[rf] * l_inputlayer] + ([rf] * lhdims) + [rf], rf)
        cost = SeqTaggerCost(coeffs, self.dropout)
        self.cost = cost

        self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                         save_path=save_best_path)
        mon_dataset = dict(self.dataset)
        if not self.monitor_train:
            del mon_dataset['train']

        _learning_rule = (self.momentum_rule if self.use_momentum else None)
        self.algorithm = SGD(batch_size=1, learning_rate=self.lr,
                             termination_criterion=term,
                             monitoring_dataset=mon_dataset,
                             cost=cost,
                             learning_rule=_learning_rule,
                             )
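        # batch_size=1: every minibatch is a single training example; the
        # momentum learning rule is only attached when use_momentum is set.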

        self.algorithm.setup(self, self.dataset['train'])
        if self.plot_monitor:
            cn = ["valid_objective", "test_objective"]
            if self.monitor_train:
                cn.append("train_objective")
            plots = Plots(channel_names=cn, save_path=self.plot_monitor)
            self.pm = PlotManager([plots], freq=1)
            self.pm.setup(self, None, self.algorithm)
Code example #44
def create_algorithm(mlp, train_set):
    rng = RandomState(hash('tobipuma') % 4294967295)
    algorithm = SGD(batch_size=20, learning_rate=0.1)
    algorithm.rng = rng  # use a fixed RandomState so the algorithm gives reproducible results
    algorithm.setup(mlp, train_set)
    return algorithm
Code example #45
File: junk3.py Project: caomw/Deep_wrapper
def runDeepLearning2():
    ### Loading training set and separting it into training set and testing set

    myDataset = Dataset("/home/Stephen/Desktop/Bird/DLearn/Data/Emotion_small/")
    preprocess = 0
    datasets = myDataset.loadTrain(preprocessFLAG=preprocess, flipFLAG=3)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    dataset_test = myDataset.loadTest(preprocess)
    test_set_x, test_set_y, test_set_y_array = dataset_test[0]
    # Temporary workaround to expose the ground-truth labels via test_set_y_array:
    # after T.cast, test_set_y becomes a TensorVariable, whose value cannot be
    # read out directly here.

    ### Model parameters
    """
    learning_rate = 0.02
    n_epochs = 3000
    nkerns=[30, 40, 40] # number of kernels at each layer; current best performance is 50.0% on the testing set with kernel numbers [30, 40, 40]
    batch_size = 500

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels


    ishape = (48, 48)  # size of input images
    nClass = 7
    """
    rng = np.random.RandomState(23455)

    # Define the model to train (layers are built directly here rather than
    # loaded from a YAML file)
    # conv layer
    layer0 = ConvRectifiedLinear(
        layer_name="h2",
        output_channels=64,
        irange=0.05,
        kernel_shape=[8, 8],
        pool_shape=[4, 4],
        pool_stride=[2, 2],
        max_kernel_norm=0.9,
    )
    # mlp
    layer2 = RectifiedLinear(layer_name="h1", dim=1000, sparse_init=15)

    # softmax
    layer3 = Softmax(max_col_norm=1.9365, layer_name="y", n_classes=7, istdev=0.05)
    ds = Dataset2(train_set_x, train_set_y)
    layers = [layer0, layer2, layer3]
    ann = mlp.MLP(layers, nvis=3)
    t_algo = SGD(learning_rate=1e-1, batch_size=500, termination_criterion=EpochCounter(400))

    t_algo.setup(ann, ds)
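    # Manual training loop: train() is called repeatedly until
    # continue_learning(ann) returns False (here governed by EpochCounter(400)).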

    while True:
        t_algo.train(dataset=ds)
        ann.monitor.report_epoch()
        ann.monitor()
        if not t_algo.continue_learning(ann):
            break
Code example #46
File: junk2.py Project: caomw/Deep_wrapper
)
"""

# softmax
layer3 = Softmax(
        max_col_norm = 1.9365,
        layer_name = 'y',
        n_classes = 7,
        istdev = .05
)
layers = [layer0, layer1, layer3]
#layers = [layer0, layer2, layer3]
ann = MLP(layers, input_space=ishape)
t_algo = SGD(learning_rate = 1e-1,
        batch_size = 100,
        batches_per_iter = 1,
        termination_criterion=EpochCounter(2)
        )
     
ds = DataPylearn2([train_set_x,train_set_y],[48,48,1],7)
t_algo.setup(ann, ds)
       
while True:
    t_algo.train(dataset=ds)
    ann.monitor.report_epoch()
    ann.monitor()
    if not t_algo.continue_learning(ann):
        break

# test: https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/scripts/icml_2013_wrepl/emotions/make_submission.py
ds2 = DataPylearn2([test_set_x,test_set_y],[48,48,1],-1)
Code example #47
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.models.mlp import MLP, Sigmoid, Linear
from pylearn2.termination_criteria import EpochCounter
from pylearn2.training_algorithms.sgd import SGD

import theano
import numpy as np

n = 200
p = 2
X = np.random.normal(0, 1, (n, p))
y = X[:,0]* X[:, 1] + np.random.normal(0, .1, n)
y.shape = (n, 1)

ds = DenseDesignMatrix(X=X, y=y)

hidden_layer = Sigmoid(layer_name='hidden', dim=10, irange=.1, init_bias=1.)
output_layer = Linear(dim=1, layer_name='y', irange=.1)
trainer = SGD(learning_rate=.05, batch_size=10,
              termination_criterion=EpochCounter(200))
layers = [hidden_layer, output_layer]
ann = MLP(layers, nvis=2)
trainer.setup(ann, ds)

while True:
    trainer.train(dataset=ds)
    ann.monitor.report_epoch()
    ann.monitor()
    if not trainer.continue_learning(ann):
        break

inputs = X 
y_est = ann.fprop(theano.shared(inputs, name='inputs')).eval()

print(y_est.shape)
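
# A minimal follow-up sketch: y_est and y are plain NumPy arrays at this point,
# so the quality of the fit can be summarised with a mean squared error.
mse = np.mean((y_est - y) ** 2)
print(mse)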
Code example #48
File: sequence_tagger.py Project: zseder/hunvec
class SequenceTaggerNetwork(Model):
    def __init__(self, dataset, w2i, t2i, featurizer,
                 edim=None, hdims=None, fedim=None,
                 max_epochs=100, use_momentum=False, lr=.01, lr_lin_decay=None,
                 lr_scale=False, lr_monitor_decay=False,
                 valid_stop=False, reg_factors=None, dropout=False,
                 dropout_params=None, embedding_init=None,
                 embedded_model=None, monitor_train=True, plot_monitor=None,
                 num=False):
        super(SequenceTaggerNetwork, self).__init__()
        self.vocab_size = dataset.vocab_size
        self.window_size = dataset.window_size
        self.total_feats = dataset.total_feats
        self.feat_num = dataset.feat_num
        self.n_classes = dataset.n_classes
        self.max_epochs = max_epochs
        if edim is None:
            edim = 50
        if hdims is None:
            hdims = [100]
        if fedim is None:
            fedim = 5
        self.edim = edim
        self.fedim = fedim
        self.hdims = hdims

        self.w2i = w2i
        self.t2i = t2i
        self.featurizer = featurizer

        self._create_tagger()
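
        # A holds the transition scores used by the Viterbi decoder: row 0
        # provides the start scores, row 1 the end scores, and rows 2: the
        # tag-to-tag transition matrix (see prepare_tagging and tag_sen below).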

        A_value = numpy.random.uniform(low=-.1, high=.1,
                                       size=(self.n_classes + 2,
                                             self.n_classes))
        self.A = sharedX(A_value, name='A')
        self.use_momentum = use_momentum
        self.lr = lr
        self.lr_lin_decay = lr_lin_decay
        self.lr_monitor_decay = lr_monitor_decay
        self.lr_scale = lr_scale
        self.valid_stop = valid_stop
        self.reg_factors = reg_factors
        self.close_cache = {}
        self.dropout_params = dropout_params
        self.dropout = dropout or self.dropout_params is not None
        self.hdims = hdims
        self.monitor_train = monitor_train
        self.num = num
        self.plot_monitor = plot_monitor
        if embedding_init is not None:
            self.set_embedding_weights(embedding_init)

    def _create_tagger(self):
        self.tagger = WordTaggerNetwork(
            self.vocab_size, self.window_size, self.total_feats,
            self.feat_num, self.hdims, self.edim, self.fedim, self.n_classes)

    def _create_data_specs(self, dataset):
        self.input_space = CompositeSpace([
            dataset.data_specs[0].components[i]
            for i in xrange(len(dataset.data_specs[0].components) - 1)])
        self.output_space = dataset.data_specs[0].components[-1]

        self.input_source = dataset.data_specs[1][:-1]
        self.target_source = dataset.data_specs[1][-1]

    def __getstate__(self):
        d = {}
        d['vocab_size'] = self.vocab_size
        d['window_size'] = self.window_size
        d['feat_num'] = self.feat_num
        d['total_feats'] = self.total_feats
        d['n_classes'] = self.n_classes
        d['input_space'] = self.input_space
        d['output_space'] = self.output_space
        d['input_source'] = self.input_source
        d['target_source'] = self.target_source
        d['A'] = self.A
        d['tagger'] = self.tagger
        d['w2i'] = self.w2i
        d['t2i'] = self.t2i
        d['featurizer'] = self.featurizer
        d['max_epochs'] = self.max_epochs
        d['use_momentum'] = self.use_momentum
        d['lr'] = self.lr
        d['lr_lin_decay'] = self.lr_lin_decay
        d['lr_monitor_decay'] = self.lr_monitor_decay
        d['lr_scale'] = self.lr_scale
        d['valid_stop'] = self.valid_stop
        d['reg_factors'] = self.reg_factors
        d['dropout'] = self.dropout
        d['dropout_params'] = self.dropout_params
        d['monitor_train'] = self.monitor_train
        d['num'] = self.num
        d['plot_monitor'] = self.plot_monitor
        return d

    def fprop(self, data):
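        # The transition block A is concatenated on top of the per-position
        # tagger scores, so the cost and the Viterbi decoder receive both in a
        # single matrix (tag_sen slices the tagger scores back out).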
        tagger_out = self.tagger.fprop(data)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    def dropout_fprop(self, data, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.0,
                      input_scales=None, per_example=True):
        if input_scales is None:
            input_scales = {'input': 1.0}
        if input_include_probs is None:
            input_include_probs = {'input': 1.0}
        if self.dropout_params is not None:
            if len(self.dropout_params) == len(self.tagger.layers) - 1:
                input_include_probs['tagger_out'] = self.dropout_params[-1]
                input_scales['tagger_out'] = 1.0/self.dropout_params[-1]
                for i, p in enumerate(self.dropout_params[:-1]):
                    input_include_probs['h{0}'.format(i)] = p
                    input_scales['h{0}'.format(i)] = 1.0/p
        tagger_out = self.tagger.dropout_fprop(
            data, default_input_include_prob, input_include_probs,
            default_input_scale, input_scales, per_example)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    @functools.wraps(Model.get_lr_scalers)
    def get_lr_scalers(self):
        if not self.lr_scale:
            return {}
        d = self.tagger.get_lr_scalers()
        d[self.A] = 1. / self.n_classes
        return d

    @functools.wraps(Model.get_params)
    def get_params(self):
        return self.tagger.get_params() + [self.A]

    def create_adjustors(self):
        initial_momentum = .5
        final_momentum = .99
        start = 1
        saturate = self.max_epochs
        self.momentum_adjustor = learning_rule.MomentumAdjustor(
            final_momentum, start, saturate)
        self.momentum_rule = learning_rule.Momentum(initial_momentum,
                                                    nesterov_momentum=True)

        if self.lr_monitor_decay:
            self.learning_rate_adjustor = MonitorBasedLRAdjuster(
                high_trigger=1., shrink_amt=0.9,
                low_trigger=.95, grow_amt=1.1, channel_name='train_objective')
        elif self.lr_lin_decay:
            self.learning_rate_adjustor = LinearDecayOverEpoch(
                start, saturate, self.lr_lin_decay)

    def compute_used_inputs(self):
        seen = {'words': set(), 'feats': set()}
        for sen_w in self.dataset['train'].X1:
            seen['words'] |= reduce(
                lambda x, y: set(x) | set(y),
                sen_w, set())
        for sen_f in self.dataset['train'].X2:
            seen['feats'] |= reduce(
                lambda x, y: set(x) | set(y),
                sen_f, set())
        words = set(xrange(len(self.w2i)))
        feats = set(xrange(self.total_feats))
        self.notseen = {
            'words': numpy.array(sorted(words - seen['words'])),
            'feats': numpy.array(sorted(feats - seen['feats']))
        }

    def set_dataset(self, data):
        self._create_data_specs(data['train'])
        self.dataset = data
        self.compute_used_inputs()
        self.tagger.notseen = self.notseen

    def create_algorithm(self, data, save_best_path=None):
        self.set_dataset(data)
        self.create_adjustors()
        term = EpochCounter(max_epochs=self.max_epochs)
        if self.valid_stop:
            cost_crit = MonitorBased(channel_name='valid_objective',
                                     prop_decrease=.0, N=3)
            term = And(criteria=[cost_crit, term])

        # coeffs structure: (per-layer weight-decay coefficients,
        #                    weight decay on the A transition matrix)
        coeffs = None
        if self.reg_factors:
            rf = self.reg_factors
            lhdims = len(self.tagger.hdims)
            l_inputlayer = len(self.tagger.layers[0].layers)
            coeffs = ([[rf] * l_inputlayer] + ([rf] * lhdims) + [rf], rf)
        cost = SeqTaggerCost(coeffs, self.dropout)
        self.cost = cost

        self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                         save_path=save_best_path)
        mon_dataset = dict(self.dataset)
        if not self.monitor_train:
            del mon_dataset['train']

        _learning_rule = (self.momentum_rule if self.use_momentum else None)
        self.algorithm = SGD(batch_size=1, learning_rate=self.lr,
                             termination_criterion=term,
                             monitoring_dataset=mon_dataset,
                             cost=cost,
                             learning_rule=_learning_rule,
                             )

        self.algorithm.setup(self, self.dataset['train'])
        if self.plot_monitor:
            cn = ["valid_objective", "test_objective"]
            if self.monitor_train:
                cn.append("train_objective")
            plots = Plots(channel_names=cn, save_path=self.plot_monitor)
            self.pm = PlotManager([plots], freq=1)
            self.pm.setup(self, None, self.algorithm)

    def train(self):
        while True:
            if not self.algorithm.continue_learning(self):
                break
            self.algorithm.train(dataset=self.dataset['train'])
            self.monitor.report_epoch()
            self.monitor()
            self.mbsb.on_monitor(self, self.dataset['valid'], self.algorithm)
            if self.use_momentum:
                self.momentum_adjustor.on_monitor(self, self.dataset['valid'],
                                                  self.algorithm)
            if hasattr(self, 'learning_rate_adjustor'):
                self.learning_rate_adjustor.on_monitor(
                    self, self.dataset['valid'], self.algorithm)
            if hasattr(self, 'pm'):
                self.pm.on_monitor(
                    self, self.dataset['valid'], self.algorithm)

    def prepare_tagging(self):
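        # Compile a single-sentence forward function and split A into its
        # start scores, end scores and transition matrix for Viterbi decoding.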
        X = self.get_input_space().make_theano_batch(batch_size=1)
        Y = self.fprop(X)
        self.f = theano.function([X[0], X[1]], Y)
        self.start = self.A.get_value()[0]
        self.end = self.A.get_value()[1]
        self.A_value = self.A.get_value()[2:]

    def process_input(self, words, feats):
        return self.f(words, feats)

    def tag_sen(self, words, feats, debug=False, return_probs=False):
        if not hasattr(self, 'f'):
            self.prepare_tagging()
        y = self.process_input(words, feats)
        tagger_out = y[2 + self.n_classes:]
        res = viterbi(self.start, self.A_value, self.end, tagger_out,
                      self.n_classes, return_probs)
        if return_probs:
            return res / res.sum(axis=1)[:,numpy.newaxis]
            #return res.reshape((1, len(res)))
        
        if debug:
            return numpy.array([[e] for e in res[1]]), tagger_out
        return numpy.array([[e] for e in res[1]])

    def get_score(self, dataset, mode='pwp'):
        self.prepare_tagging()
        tagged = (self.tag_sen(w, f) for w, f in
                  izip(dataset.X1, dataset.X2))
        gold = dataset.y
        good, bad = 0., 0.
        if mode == 'pwp':
            for t, g in izip(tagged, gold):
                g = g.argmax(axis=1)
                t = t.flatten()
                good += sum(t == g)
                bad += sum(t != g)
            return [good / (good + bad)]
        elif mode == 'f1':
            i2t = [t for t, i in sorted(self.t2i.items(), key=lambda x: x[1])]
            f1c = FScCounter(i2t, binary_input=False)
            gold = map(lambda x:x.argmax(axis=1), gold)
            tagged = map(lambda x:x.flatten(), tagged)
            return f1c.count_score(gold, tagged)

    def set_embedding_weights(self, embedding_init):
        # load embedding with gensim
        from gensim.models import Word2Vec
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
            edim = m.layer1_size
        except UnicodeDecodeError:
            try:
                m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
                edim = m.layer1_size
            except UnicodeDecodeError:
                # not in word2vec format
                m = Word2Vec.load(embedding_init)
                edim = m.layer1_size
        except ValueError:
            # glove model
            m = {}
            if embedding_init.endswith('gz'):
                fp = gzip.open(embedding_init)
            else:
                fp = open(embedding_init)
            for l in fp:
                le = l.split()
                m[le[0].decode('utf-8')] = numpy.array(
                    [float(e) for e in le[1:]], dtype=theano.config.floatX)
                edim = len(le) - 1

        if edim != self.edim:
            raise Exception("Embedding dim and edim don't match")
        m_lower = {}
        vocab = (m.vocab if hasattr(m, 'vocab') else m)
        for k in vocab:
            if k in ['UNKNOWN', 'PADDING']:
                continue
            if self.num:
                m_lower[replace_numerals(k.lower())] = m[k]
            else:
                m_lower[k.lower()] = m[k]
        # copy the embedding vectors into the tagger's weight matrix using self.w2i
        params = numpy.zeros(
            self.tagger.layers[0].layers[0].get_param_vector().shape, dtype=theano.config.floatX)
        e = self.edim
        for w in self.w2i:
            if w in m_lower:
                v = m_lower[w]
                i = self.w2i[w]
                params[i*e:(i+1)*e] = v
        if 'UNKNOWN' in vocab:
            params[-1*e:] = vocab['UNKNOWN']
        if 'PADDING' in vocab:
            params[-2*e:-1*e] = vocab['PADDING']
        self.tagger.layers[0].layers[0].set_param_vector(params)