def main():
    # Load data from the MySQL database
    # pr = cProfile.Profile()
    # pr.enable()

    results, regions, final_data = dataLoader.load_from_sql()
    # final_data = dataManipulator.run_manipulation(results, regions, '1880-1-1', '2019-1-1')
    # final_geo_data = dataManipulator.merge_geodata(final_data)
    # pr.disable()
    # pr.print_stats(sort='cumtime')

    visualise.visualise_results(final_data)
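# dataLoader.load_from_sql() is project-specific and not shown here. The sketch below is only
# an illustration of what such a loader might look like, assuming SQLAlchemy + pandas; the
# connection string and table names are placeholders, not the project's actual schema.
import pandas as pd
from sqlalchemy import create_engine

def load_from_sql_sketch(connection_string="mysql+pymysql://user:password@localhost/mydb"):
    """Illustrative stand-in for dataLoader.load_from_sql(): read three tables into DataFrames."""
    engine = create_engine(connection_string)
    results = pd.read_sql("SELECT * FROM results", engine)
    regions = pd.read_sql("SELECT * FROM regions", engine)
    final_data = pd.read_sql("SELECT * FROM final_data", engine)  # pre-joined view used for visualisation
    return results, regions, final_data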
icr_output_dir = args.icr_output_dir
coded_dir_path = args.coded_dir_path
csv_by_message_output_path = args.csv_by_message_output_path
csv_by_individual_output_path = args.csv_by_individual_output_path
production_csv_output_path = args.production_csv_output_path

# Load the pipeline configuration file
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

log.info("Loading the raw data...")
data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

log.info("Translating source keys...")
data = TranslateSourceKeys.translate_source_keys(user, data, pipeline_configuration)

if pipeline_configuration.move_ws_messages:
    log.info("Pre-filtering empty message objects...")
    # This is a performance optimisation that saves execution time and memory when moving WS messages,
    # by removing the need to mark and process a high volume of empty message objects as 'NR' in
    # WS correction. Empty message objects represent flow runs where the participant never sent a
    # message, e.g. an advert flow run where we asked someone a question but didn't receive a response.
    data = MessageFilters.filter_empty_messages(
        data,
        [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS]
    )
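# MessageFilters.filter_empty_messages comes from the pipeline's shared libraries and is not
# shown here. The sketch below illustrates the filtering rule on plain dicts, assuming a message
# is "empty" when none of the given raw fields carries a value; the real pipeline operates on
# TracedData objects, so this helper is illustrative only.
def filter_empty_messages_sketch(messages, raw_fields):
    """Keep only message objects that have a value for at least one of the given raw fields."""
    return [
        msg for msg in messages
        if any(msg.get(field) not in (None, "") for field in raw_fields)
    ]

# e.g. filter_empty_messages_sketch(data, [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS])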
def training(lr=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500, n_layer=2):
    datasets = LoadData.load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    dict48_39 = datasets[3]  # 48->39 mapping returned by the loader (not used below)

    # compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of [int] labels

    rng = numpy.random.RandomState(1234)

    dnn = DNN.DNN(rng=rng, inputdata=x, num_in=28 * 28, num_hidden=n_hidden,
                  num_out=10, num_layer=n_layer)

    # the cost to minimise: negative log-likelihood plus L1/L2 regularisation terms
    cost = (
        dnn.negative_log_likelihood(y)
        + L1_reg * dnn.L1
        + L2_reg * dnn.L2_sqr
    )

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=dnn.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=dnn.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # gradients of the cost with respect to each parameter, and the SGD update rule
    gparams = [T.grad(cost, param) for param in dnn.params]
    updates = [(param, param - lr * gparam)
               for param, gparam in zip(dnn.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # --- early-stopping parameters ---
    # look at this many examples regardless
    patience = 10000
    # wait this much longer when a new best is found
    patience_increase = 2
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many minibatches before checking the network on the validation set;
    # in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set
                validation_losses = [validate_model(i) for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)
                )

                # if we got the best validation score so far
                if this_validation_loss < best_validation_loss:
                    # increase patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.time()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.))
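# The patience-based early stopping above can be hard to follow inside the training loop.
# Below is a minimal, framework-independent sketch of the same rule, replayed over a
# precomputed list of validation losses; the helper name and the toy losses in the usage
# example are made up for illustration and are not part of the original code.
def patience_early_stopping_sketch(validation_losses, patience=10000, patience_increase=2,
                                   improvement_threshold=0.995, validation_frequency=100):
    """Replay the patience rule over losses measured every `validation_frequency` iterations."""
    best_loss = float('inf')
    best_iter = 0
    for k, loss in enumerate(validation_losses):
        it = (k + 1) * validation_frequency - 1  # iteration at which this loss was measured
        if loss < best_loss:
            # a sufficiently large relative improvement extends the training budget
            if loss < best_loss * improvement_threshold:
                patience = max(patience, it * patience_increase)
            best_loss = loss
            best_iter = it
        if patience <= it:
            break  # patience exhausted: stop training
    return best_loss, best_iter

# e.g. patience_early_stopping_sketch([0.12, 0.10, 0.099, 0.101], validation_frequency=500)
# returns (0.099, 1499): the third measurement is the best, and with these small iteration
# counts the 10000-iteration patience budget is never exhausted.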