Ejemplo n.º 1
0
def main():
    """Load and visualise SQL results, then run the first pipeline stages.

    The body has two parts:

    1. Fetch ``results``, ``regions`` and ``final_data`` through
       ``dataLoader.load_from_sql()`` and pass ``final_data`` to
       ``visualise.visualise_results``.
    2. Read output paths from ``args``, load the pipeline configuration,
       load the raw data, translate source keys and, when
       ``pipeline_configuration.move_ws_messages`` is set, drop empty
       message objects.

    NOTE(review): ``args``, ``pipeline_configuration_file_path``, ``user``
    and ``raw_data_dir`` are not bound inside this function; presumably they
    are module-level names or this snippet was spliced from two different
    sources -- confirm before relying on it.
    """
    # Load data from MySQL database
    # (The cProfile lines below are disabled profiling scaffolding.)
    # pr = cProfile.Profile()
    # pr.enable()

    # Only final_data is used below; results and regions are unused unless
    # the disabled manipulation step is re-enabled.
    results, regions, final_data = dataLoader.load_from_sql()

    # Disabled: recompute final_data from the raw results for a date range
    # and merge in geodata.
    # final_data = dataManipulator.run_manipulation(results, regions, '1880-1-1', '2019-1-1')
    # final_geo_data = dataManipulator.merge_geodata(final_data)

    # pr.disable()
    # pr.print_stats(sort='cumtime')

    visualise.visualise_results(final_data)

    # Output locations taken from the parsed arguments. None of them is read
    # again in the visible part of this function -- presumably they are used
    # by later pipeline stages that are not shown here (TODO confirm).
    icr_output_dir = args.icr_output_dir
    coded_dir_path = args.coded_dir_path
    csv_by_message_output_path = args.csv_by_message_output_path
    csv_by_individual_output_path = args.csv_by_individual_output_path
    production_csv_output_path = args.production_csv_output_path

    # Load the pipeline configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    # Tag subsequent log output with this pipeline's name.
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info("Loading the raw data...")
    data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

    log.info("Translating source Keys...")
    data = TranslateSourceKeys.translate_source_keys(user, data,
                                                     pipeline_configuration)

    if pipeline_configuration.move_ws_messages:
        log.info("Pre-filtering empty message objects...")
        # This is a performance optimisation to save execution time + memory when moving WS messages, by removing
        # the need to mark and process a high volume of empty message objects as 'NR' in WS correction.
        # Empty message objects represent flow runs where the participants never sent a message e.g. from an advert
        # flow run where we asked someone a question but didn't receive a response.
        # Note: the raw fields come from the class-level RQA_CODING_PLANS,
        # not from the loaded pipeline_configuration instance.
        data = MessageFilters.filter_empty_messages(data, [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ])
Ejemplo n.º 3
0
def training(lr=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500, n_layer=2):
    """Train a DNN with minibatch SGD and patience-based early stopping.

    Progress, the best validation/test error and the elapsed time are
    printed; nothing is returned.

    Args:
        lr: Step size of the plain gradient-descent update
            ``param - lr * grad``.
        L1_reg: Weight of ``dnn.L1`` in the cost.
        L2_reg: Weight of ``dnn.L2_sqr`` in the cost.
        n_epochs: Maximum number of passes over the training set.
        dataset: Passed unchanged to ``LoadData.load_data``.
        batch_size: Number of examples per minibatch.
        n_hidden: Forwarded to ``DNN.DNN`` as ``num_hidden``.
        n_layer: Forwarded to ``DNN.DNN`` as ``num_layer``.
    """
    datasets = LoadData.load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Number of whole minibatches per split; a trailing partial batch is
    # ignored. Integer floor division keeps these counts as ints.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # symbolic input matrix, one example per row
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    rng = numpy.random.RandomState(1234)

    # NOTE(review): num_in / num_out are hard-coded to 28 * 28 and 10 --
    # confirm they match whatever `dataset` actually contains.
    dnn = DNN.DNN(rng=rng, inputdata=x, num_in=28 * 28, num_hidden=n_hidden,
                  num_out=10, num_layer=n_layer)

    cost = (dnn.negative_log_likelihood(y) + L1_reg * dnn.L1
            + L2_reg * dnn.L2_sqr)

    def batch_givens(set_x, set_y):
        """Substitute x / y with the `index`-th minibatch of a split."""
        start = index * batch_size
        stop = (index + 1) * batch_size
        return {x: set_x[start:stop], y: set_y[start:stop]}

    # Compiled functions returning dnn.errors(y) on one minibatch.
    test_model = theano.function(
        inputs=[index],
        outputs=dnn.errors(y),
        givens=batch_givens(test_set_x, test_set_y),
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=dnn.errors(y),
        givens=batch_givens(valid_set_x, valid_set_y),
    )

    # Plain SGD: every parameter moves against its gradient, scaled by lr.
    gparams = [T.grad(cost, param) for param in dnn.params]
    updates = [(param, param - lr * gparam)
               for param, gparam in zip(dnn.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=batch_givens(train_set_x, train_set_y),
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # --- early-stopping parameters ---
    # look at this many minibatch iterations regardless
    patience = 10000
    # wait this much longer when a new best is found
    patience_increase = 2
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch.
    # Floor division keeps the frequency an int under Python 3.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported clock for measuring elapsed time.
    start_time = time.perf_counter()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # One SGD step; the returned minibatch cost is not needed here.
            train_model(minibatch_index)

            # Global (0-based) count of minibatch updates performed so far.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # mean of the per-minibatch validation errors
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # extend patience only if the improvement is significant
                    if (this_validation_loss
                            < best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.perf_counter()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' +
          os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.))