Example no. 1
0
def test_AE(data='',
            validationdata='',
            param_list=None,
            n_hidden=288,
            share=False,
            missing=True,
            missing_rate=0.2,
            learning_rate=0.08,
            training_epochs=10,
            batch_size=100,
            output_folder='dA_plots',
            order=1):
    """Train an autoencoder with early stopping, then report per-modality
    RMSE and error-ratio statistics.

    Parameters
    ----------
    data, validationdata : str
        Paths forwarded to ``load_data`` for the training / validation sets.
    param_list : list or None
        Per-modality parameters forwarded to ``load_data`` and ``AE``.
    n_hidden : int
        Hidden-layer size.
    share : bool
        Weight-sharing flag forwarded to ``AE``.
    missing : bool
        Whether missing entries are modelled via the indicator matrix.
    missing_rate : float
        Fraction of entries treated as missing.
    learning_rate : float
        SGD learning rate.
    training_epochs : int
        Maximum number of passes over the training set.
    batch_size : int
        Training minibatch size.
    output_folder : str
        Subfolder of ``../Result`` receiving models, reports and outputs.
    order : int
        1 for the first stage (raw inputs, denormalisation applied),
        2 for a stacked stage trained on hidden representations.

    Returns
    -------
    tuple or None
        ``(model_path, h1_path, h_valid_path, indi_matrix_test,
        indi_matrix_valid_test[, indi_matrix_final_test])`` depending on
        ``order``; implicitly ``None`` for any other ``order``.
    """
    # All outputs (checkpoints, hidden codes, reports) go in this folder.
    newpath = '../Result/' + output_folder + '_' + str(missing_rate)
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    ####################################
    # Initializing training dataset    #
    ####################################
    datasets, indi_matrix, data_test, indi_matrix_test, n_train_batches, \
        numMod, raw, trainstats_list, visible_size_Mod = \
        load_data(param_list, data, batch_size, missing_rate,
                  train=True, order=order)

    ####################################
    # Initializing validation dataset  #
    ####################################
    valid_batch_size = 306
    validationset, indi_matrix_validation, valid_test, \
        indi_matrix_valid_test, n_valid_batches = \
        load_data(param_list, validationdata, valid_batch_size, missing_rate,
                  train=False, order=order)

    # Symbolic variables: x is a minibatch of data, y the matching
    # missing-data indicator matrix.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')
    y = T.matrix('y')

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    ae = AE(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            indi_matrix=y,
            bias_matrix=None,
            n_visible=raw.shape[1],
            n_hidden=n_hidden,
            W1=None,
            W2=None,
            bhid=None,
            bvis=None,
            missing=missing,
            param_list=param_list,
            share=share)

    cost, updates = ae.get_cost_updates(learning_rate)

    train_ae = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: datasets[index * batch_size:(index + 1) * batch_size],
            y: indi_matrix[index * batch_size:(index + 1) * batch_size]
        },
        on_unused_input='warn',
    )

    validate_cost = ae.get_cost()

    validate_ae = theano.function(
        [index],
        validate_cost,
        givens={
            x:
            validationset[index * valid_batch_size:(index + 1) *
                          valid_batch_size],
            y:
            indi_matrix_validation[index * valid_batch_size:(index + 1) *
                                   valid_batch_size]
        },
        on_unused_input='warn',
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # Early-stopping parameters.
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    # Validate at least once per epoch. Integer division keeps the
    # frequency an int so the `%` test below behaves predictably.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    best_epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        c = []
        for minibatch_index in range(int(n_train_batches)):
            c.append(train_ae(minibatch_index))
            # Number of minibatches processed so far (renamed from `iter`
            # to avoid shadowing the built-in).
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # Average validation cost over all validation minibatches.
                validation_losses = [
                    validate_ae(i) for i in range(int(n_valid_batches))
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation cost %f ' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss

                    # save the best model
                    print('saving the model for epoch %i' % epoch)
                    best_epoch = epoch
                    save(newpath + '/best_model_epoch_' + str(epoch) + '.pkl',
                         ae)

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f') %
          (best_validation_loss))
    # BUG FIX: `print(sys.stderr, ...)` printed the stream object itself to
    # stdout; route the message to stderr via the `file=` keyword instead.
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.1fs' % (end_time - start_time),
          file=sys.stderr)

    ####################################
    # computing RMSE and error ratio   #
    ####################################
    print('Best epoch is %i' % best_epoch)
    print('Now we starting computing the RMSE and error ratio')

    # Keep only the best checkpoint; delete the superseded ones.
    for i in range(best_epoch - 1):
        path = newpath + '/best_model_epoch_' + str(i + 1) + '.pkl'
        if os.path.exists(path):
            os.remove(path)

    # From here on `ae` is re-bound to the *path* of the best saved model
    # (this path is also what the function returns).
    ae = newpath + '/best_model_epoch_' + str(best_epoch) + '.pkl'

    W1, W2, b1, b2, G = load_numerical_params_ae(ae)

    bias_matrix = None

    # Hidden representations for the held-out train/validation splits.
    y = get_hidden_values(data_test, indi_matrix_test, W1, b1, G, bias_matrix,
                          missing)
    y2 = get_hidden_values(valid_test, indi_matrix_valid_test, W1, b1, G,
                           bias_matrix, missing)

    h1 = newpath + '/h1.npy'
    h_valid = newpath + '/h_valid.npy'

    numpy.save(h1, y)
    numpy.save(h_valid, y2)

    reconstruction = get_reconstructed_input(y, W2, b2, G, missing)
    print(reconstruction)

    numpy.savetxt(newpath + '/output_' + str(best_epoch) + '.txt',
                  reconstruction,
                  delimiter=',')

    f = open(newpath + '/AE_' + str(best_epoch) + '.txt', 'w')

    if order == 1:
        # First stage: reconstructions must be denormalised with the
        # per-modality training statistics before comparison.
        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_' + str(i) + '.txt',
                          raw[:, lo:hi], delimiter=',')
            recon_i = denormActiv(reconstruction[:, lo:hi], trainstats_list[i])
            numpy.savetxt(newpath + '/Recstru_' + str(i) + '_' +
                          str(best_epoch) + '.txt', recon_i, delimiter=',')

            # BUG FIX: `print(f, ...)` printed the file object; the file
            # itself is written via f.write, so print only logs to stdout.
            rmse_i = RMSE(raw[:, lo:hi], recon_i)
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw[:, lo:hi], recon_i,
                                  indi_matrix_test[:, lo:hi], missing_rate)
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')

            print('we are done here')
    else:
        # Stacked stage: hidden-space reconstructions are compared directly
        # (no denormalisation, 2-argument error_ratio).
        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_' + str(i) + '.txt',
                          raw[:, lo:hi], delimiter=',')
            numpy.savetxt(newpath + '/Recstru_' + str(i) + '_' +
                          str(best_epoch) + '.txt',
                          reconstruction[:, lo:hi], delimiter=',')

            rmse_i = RMSE(raw[:, lo:hi], reconstruction[:, lo:hi])
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw[:, lo:hi], reconstruction[:, lo:hi])
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')
    # BUG FIX: the report file was never closed (and, in the order == 1
    # branch, was re-bound below while still open).
    f.close()

    if order == 1:
        print(
            '...This is the missing case, we predict the reconstruction error ratio for the test data...'
        )

        raw_test, data_real_test, teststats_list, indi_matrix_final_test = \
            load_test_data(data, numMod, missing_rate)

        y1 = get_hidden_values(data_real_test, indi_matrix_final_test, W1, b1,
                               G, bias_matrix, missing)

        reconstruction_test = get_reconstructed_input(y1, W2, b2, G, missing)
        print(reconstruction_test)

        numpy.savetxt(newpath + '/output_test_' + str(best_epoch) + '.txt',
                      reconstruction_test,
                      delimiter=',')

        f = open(newpath + '/AE_test_' + str(best_epoch) + '.txt', 'w')

        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_test_' + str(i) + '.txt',
                          raw_test[:, lo:hi], delimiter=',')
            recon_i = denormActiv(reconstruction_test[:, lo:hi],
                                  teststats_list[i])
            numpy.savetxt(newpath + '/Recstru_test_' + str(i) + '_' +
                          str(best_epoch) + '.txt', recon_i, delimiter=',')

            rmse_i = RMSE(raw_test[:, lo:hi], recon_i)
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw_test[:, lo:hi], recon_i,
                                  indi_matrix_final_test[:, lo:hi],
                                  missing_rate)
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')
        f.close()

        return ae, h1, h_valid, indi_matrix_test, indi_matrix_valid_test, \
            indi_matrix_final_test

    if order == 2:
        return ae, h1, h_valid, indi_matrix_test, indi_matrix_valid_test
def test_AE(data='',
            validationdata='',
            param_list=None,
            n_hidden=288,
            share=False,
            missing=True,
            missing_rate=0.2,
            learning_rate=0.08,
            training_epochs=10,
            batch_size=100,
            output_folder='dA_plots',
            order=1):
    """Train an autoencoder with early stopping, then report per-modality
    RMSE and error-ratio statistics.

    Parameters
    ----------
    data, validationdata : str
        Paths forwarded to ``load_data`` for the training / validation sets.
    param_list : list or None
        Per-modality parameters forwarded to ``load_data`` and ``AE``.
    n_hidden : int
        Hidden-layer size.
    share : bool
        Weight-sharing flag forwarded to ``AE``.
    missing : bool
        Whether missing entries are modelled via the indicator matrix.
    missing_rate : float
        Fraction of entries treated as missing.
    learning_rate : float
        SGD learning rate.
    training_epochs : int
        Maximum number of passes over the training set.
    batch_size : int
        Training minibatch size.
    output_folder : str
        Subfolder of ``../Result`` receiving models, reports and outputs.
    order : int
        1 for the first stage (raw inputs, denormalisation applied),
        2 for a stacked stage trained on hidden representations.

    Returns
    -------
    tuple or None
        ``(model_path, h1_path, h_valid_path, indi_matrix_test,
        indi_matrix_valid_test[, indi_matrix_final_test])`` depending on
        ``order``; implicitly ``None`` for any other ``order``.
    """
    # All outputs (checkpoints, hidden codes, reports) go in this folder.
    newpath = '../Result/' + output_folder + '_' + str(missing_rate)
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    ####################################
    # Initializing training dataset    #
    ####################################
    datasets, indi_matrix, data_test, indi_matrix_test, n_train_batches, \
        numMod, raw, trainstats_list, visible_size_Mod = \
        load_data(param_list, data, batch_size, missing_rate,
                  train=True, order=order)

    ####################################
    # Initializing validation dataset  #
    ####################################
    valid_batch_size = 306
    validationset, indi_matrix_validation, valid_test, \
        indi_matrix_valid_test, n_valid_batches = \
        load_data(param_list, validationdata, valid_batch_size, missing_rate,
                  train=False, order=order)

    # Symbolic variables: x is a minibatch of data, y the matching
    # missing-data indicator matrix.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')
    y = T.matrix('y')

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    ae = AE(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            indi_matrix=y,
            bias_matrix=None,
            n_visible=raw.shape[1],
            n_hidden=n_hidden,
            W1=None,
            W2=None,
            bhid=None,
            bvis=None,
            missing=missing,
            param_list=param_list,
            share=share)

    cost, updates = ae.get_cost_updates(learning_rate)

    train_ae = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: datasets[index * batch_size:(index + 1) * batch_size],
            y: indi_matrix[index * batch_size:(index + 1) * batch_size]
        },
        on_unused_input='warn',
    )

    validate_cost = ae.get_cost()

    validate_ae = theano.function(
        [index],
        validate_cost,
        givens={
            x:
            validationset[index * valid_batch_size:(index + 1) *
                          valid_batch_size],
            y:
            indi_matrix_validation[index * valid_batch_size:(index + 1) *
                                   valid_batch_size]
        },
        on_unused_input='warn',
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # Early-stopping parameters.
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    # Validate at least once per epoch. Integer division keeps the
    # frequency an int so the `%` test below behaves predictably.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    best_epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        c = []
        for minibatch_index in range(int(n_train_batches)):
            c.append(train_ae(minibatch_index))
            # Number of minibatches processed so far (renamed from `iter`
            # to avoid shadowing the built-in).
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # Average validation cost over all validation minibatches.
                validation_losses = [
                    validate_ae(i) for i in range(int(n_valid_batches))
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation cost %f ' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss

                    # save the best model
                    print('saving the model for epoch %i' % epoch)
                    best_epoch = epoch
                    save(newpath + '/best_model_epoch_' + str(epoch) + '.pkl',
                         ae)

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f') %
          (best_validation_loss))
    # BUG FIX: `print(sys.stderr, ...)` printed the stream object itself to
    # stdout; route the message to stderr via the `file=` keyword instead.
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.1fs' % (end_time - start_time),
          file=sys.stderr)

    ####################################
    # computing RMSE and error ratio   #
    ####################################
    print('Best epoch is %i' % best_epoch)
    print('Now we starting computing the RMSE and error ratio')

    # Keep only the best checkpoint; delete the superseded ones.
    for i in range(best_epoch - 1):
        path = newpath + '/best_model_epoch_' + str(i + 1) + '.pkl'
        if os.path.exists(path):
            os.remove(path)

    # From here on `ae` is re-bound to the *path* of the best saved model
    # (this path is also what the function returns).
    ae = newpath + '/best_model_epoch_' + str(best_epoch) + '.pkl'

    W1, W2, b1, b2, G = load_numerical_params_ae(ae)

    bias_matrix = None

    # Hidden representations for the held-out train/validation splits.
    y = get_hidden_values(data_test, indi_matrix_test, W1, b1, G, bias_matrix,
                          missing)
    y2 = get_hidden_values(valid_test, indi_matrix_valid_test, W1, b1, G,
                           bias_matrix, missing)

    h1 = newpath + '/h1.npy'
    h_valid = newpath + '/h_valid.npy'

    numpy.save(h1, y)
    numpy.save(h_valid, y2)

    reconstruction = get_reconstructed_input(y, W2, b2, G, missing)
    print(reconstruction)

    numpy.savetxt(newpath + '/output_' + str(best_epoch) + '.txt',
                  reconstruction,
                  delimiter=',')

    f = open(newpath + '/AE_' + str(best_epoch) + '.txt', 'w')

    if order == 1:
        # First stage: reconstructions must be denormalised with the
        # per-modality training statistics before comparison.
        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_' + str(i) + '.txt',
                          raw[:, lo:hi], delimiter=',')
            recon_i = denormActiv(reconstruction[:, lo:hi], trainstats_list[i])
            numpy.savetxt(newpath + '/Recstru_' + str(i) + '_' +
                          str(best_epoch) + '.txt', recon_i, delimiter=',')

            # BUG FIX: `print(f, ...)` printed the file object; the file
            # itself is written via f.write, so print only logs to stdout.
            rmse_i = RMSE(raw[:, lo:hi], recon_i)
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw[:, lo:hi], recon_i,
                                  indi_matrix_test[:, lo:hi], missing_rate)
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')

            print('we are done here')
    else:
        # Stacked stage: hidden-space reconstructions are compared directly
        # (no denormalisation, 2-argument error_ratio).
        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_' + str(i) + '.txt',
                          raw[:, lo:hi], delimiter=',')
            numpy.savetxt(newpath + '/Recstru_' + str(i) + '_' +
                          str(best_epoch) + '.txt',
                          reconstruction[:, lo:hi], delimiter=',')

            rmse_i = RMSE(raw[:, lo:hi], reconstruction[:, lo:hi])
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw[:, lo:hi], reconstruction[:, lo:hi])
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')
    # BUG FIX: the report file was never closed (and, in the order == 1
    # branch, was re-bound below while still open).
    f.close()

    if order == 1:
        print(
            '...This is the missing case, we predict the reconstruction error ratio for the test data...'
        )

        raw_test, data_real_test, teststats_list, indi_matrix_final_test = \
            load_test_data(data, numMod, missing_rate)

        y1 = get_hidden_values(data_real_test, indi_matrix_final_test, W1, b1,
                               G, bias_matrix, missing)

        reconstruction_test = get_reconstructed_input(y1, W2, b2, G, missing)
        print(reconstruction_test)

        numpy.savetxt(newpath + '/output_test_' + str(best_epoch) + '.txt',
                      reconstruction_test,
                      delimiter=',')

        f = open(newpath + '/AE_test_' + str(best_epoch) + '.txt', 'w')

        for i in range(int(numMod)):
            lo, hi = i * visible_size_Mod, (i + 1) * visible_size_Mod
            numpy.savetxt(newpath + '/Raw_test_' + str(i) + '.txt',
                          raw_test[:, lo:hi], delimiter=',')
            recon_i = denormActiv(reconstruction_test[:, lo:hi],
                                  teststats_list[i])
            numpy.savetxt(newpath + '/Recstru_test_' + str(i) + '_' +
                          str(best_epoch) + '.txt', recon_i, delimiter=',')

            rmse_i = RMSE(raw_test[:, lo:hi], recon_i)
            print('AE RMSE for Modality', i, str(rmse_i))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse_i) + '\n')

            ratio_i = error_ratio(raw_test[:, lo:hi], recon_i,
                                  indi_matrix_final_test[:, lo:hi],
                                  missing_rate)
            print('AE error ratio for Modality', i, str(ratio_i))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio_i) + '\n')
        f.close()

        return ae, h1, h_valid, indi_matrix_test, indi_matrix_valid_test, \
            indi_matrix_final_test

    if order == 2:
        return ae, h1, h_valid, indi_matrix_test, indi_matrix_valid_test
Example no. 3
0
def test_SAE(data='',
             validationdata='',
             param_list=[],
             n_hidden1=288,
             n_hidden2=100,
             missing1=False,
             missing2=False,
             share1=False,
             share2=True,
             missingrate1=0,
             missingrate2=0,
             learningrate=0.08,
             training_epochs=3,
             batch_size=3000,
             output_folder=''):
    """Pre-train, fine-tune and evaluate a two-layer stacked auto-encoder (SAE).

    Pipeline:
      1. Greedily pre-train two auto-encoders with ``build_block`` (the
         second one is trained on the hidden representation produced by the
         first).
      2. Stack them into an ``SAE`` and fine-tune end-to-end with mini-batch
         SGD, early stopping on the validation cost, and best-model
         checkpointing (pickled under ``../Result/<output_folder>``).
      3. Reload the best checkpoint, reconstruct the data with plain NumPy
         forward passes, and write per-modality raw/reconstructed matrices
         plus RMSE and error-ratio metrics to disk.

    Parameters
    ----------
    data, validationdata : str
        Paths of the training / validation data consumed by the loaders.
    param_list : list
        Normalisation parameters forwarded to the loaders and the SAE.
        (Mutable default kept for interface compatibility; it is only read
        here, never mutated.)
    n_hidden1, n_hidden2 : int
        Hidden-layer sizes of the first and second auto-encoder.
    missing1, missing2 : bool
        Whether each pre-training stage models missing data.
    share1, share2 : bool
        Weight-sharing flags for each pre-training stage.
    missingrate1, missingrate2 : float
        Simulated missing-data rates for each stage.
    learningrate : float
        SGD learning rate used for both pre-training and fine-tuning.
    training_epochs : int
        Maximum number of fine-tuning epochs (early stopping may end sooner).
    batch_size : int
        Fine-tuning mini-batch size.
    output_folder : str
        Sub-folder of ``../Result`` that receives checkpoints and reports.

    Returns
    -------
    None; all results are written to disk and stdout.
    """
    newpath = '../Result/' + output_folder
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    # ---- Stage 1: greedy layer-wise pre-training --------------------------
    ae1, h1, h_valid, indi_matrix_test, indi_matrix_valid_test = \
        build_block(data, validationdata, param_list, n_hidden1, share1,
                    missing1, missingrate1, learningrate, training_epochs,
                    batch_size, output_folder='m_HI_1', order=1)

    print('Fininshed training the first auto encoder')

    # The second auto-encoder is trained on the first one's hidden
    # activations (h1 / h_valid).
    ae2, h2, h2_valid, indi_matrix_test2, indi_matrix_valid_test2 = \
        build_block(h1, h_valid, param_list, n_hidden2, share2,
                    missing=missing2, missing_rate=missingrate2,
                    learning_rate=learningrate,
                    training_epochs=training_epochs,
                    batch_size=30, output_folder='m_HI_2', order=2)

    print('Fininshed training the second auto encoder')

    # ---- Data for end-to-end fine-tuning ----------------------------------
    datasets, indi_matrix, data_test, n_train_batches, numMod, raw, \
        trainstats_list, visible_size_Mod = \
        load_data_sae(param_list, data, indi_matrix_test, batch_size,
                      train=True)

    valid_batch_size = 306
    validset, valid_indi_matrix, n_valid_batches = \
        load_data_sae(param_list, validationdata, indi_matrix_valid_test,
                      valid_batch_size, train=False)

    # Symbolic variables: mini-batch index, input matrix, missingness mask.
    index = T.lscalar()
    x = T.matrix('x')
    y = T.matrix('y')

    sae = SAE(
        ae1=ae1,
        ae2=ae2,
        input=x,
        indi_matrix=y,
        missing=missing1,
        param_list=param_list,
    )

    cost, updates = sae.finetuning(learningrate)

    train_sae = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: datasets[index * batch_size:(index + 1) * batch_size],
            y: indi_matrix[index * batch_size:(index + 1) * batch_size]
        },
        on_unused_input='warn',
    )

    validate_cost = sae.get_cost()

    validate_sae = theano.function(
        [index],
        validate_cost,
        givens={
            x:
            validset[index * valid_batch_size:(index + 1) * valid_batch_size],
            y:
            valid_indi_matrix[index * valid_batch_size:(index + 1) *
                              valid_batch_size]
        },
        on_unused_input='warn',
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the sae model')
    # Early-stopping parameters.
    patience = 5000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement that counts as
                                   # significant
    # Validate at least once per epoch; integer division keeps the
    # frequency an int so the modulo test below is exact.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    best_epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        epoch_costs = []
        for minibatch_index in range(int(n_train_batches)):

            batch_cost = train_sae(minibatch_index)
            epoch_costs.append(batch_cost)
            print(batch_cost)
            # Number of minibatches processed so far, across all epochs.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # Average fine-tuning cost over the validation batches.
                validation_losses = [
                    validate_sae(i) for i in range(int(n_valid_batches))
                ]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation cost %f ' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss))

                # If we got the best validation score so far...
                if this_validation_loss < best_validation_loss:
                    # ...extend patience when the improvement is significant.
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss

                    # Checkpoint the best model seen so far.
                    print('saving the model for epoch %i' % epoch)
                    best_epoch = epoch
                    save(newpath + '/best_model_epoch_' + str(epoch) + '.pkl',
                         sae)

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f') %
          (best_validation_loss))
    # Fixed Python-2 ``print >> sys.stderr`` leftover: the timing line now
    # actually goes to stderr instead of printing the stream object.
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.1fs' % (end_time - start_time),
          file=sys.stderr)

    print('Best epoch is %i' % best_epoch)
    print('Now we starting computing the RMSE and error ratio')

    # Reload the best checkpoint and reconstruct the held-out split of the
    # training data with plain NumPy forward passes.
    best_model_path = newpath + '/best_model_epoch_' + str(best_epoch) + '.pkl'

    W1, W2, b1, b2, G, W3, W4, b3, b4, G_share = \
        load_numerical_params_sae(best_model_path)

    h1 = get_h1(missing1, data_test, W1, b1, G)
    h2 = get_h2(h1, W2, b2, G_share)
    h3 = get_h3(h2, W3, b3, G_share)
    reconstruction = get_reconstruct(missing1, h3, W4, b4, G)

    print(reconstruction)

    np.savetxt(newpath + '/output_' + str(best_epoch) + '.txt',
               reconstruction,
               delimiter=',')

    # Per-modality report; the context manager guarantees the report file is
    # closed (the original leaked the handle).
    with open(newpath + '/AE_' + str(best_epoch) + '.txt', 'w') as f:
        for i in range(int(numMod)):
            # Hoist the modality slice and its denormalised reconstruction:
            # the original recomputed them four times per modality.
            raw_mod = raw[:, i * visible_size_Mod:(i + 1) * visible_size_Mod]
            recon_mod = denormActiv(
                reconstruction[:, i * visible_size_Mod:(i + 1) *
                               visible_size_Mod], trainstats_list[i])

            np.savetxt(newpath + '/Raw_' + str(i) + '.txt', raw_mod,
                       delimiter=',')
            np.savetxt(newpath + '/Recstru_' + str(i) + '_' + str(best_epoch) +
                       '.txt', recon_mod, delimiter=',')

            rmse = RMSE(raw_mod, recon_mod)
            ratio = error_ratio(raw_mod, recon_mod)

            # Fixed Python-2 ``print >> f`` leftovers: the original
            # ``print(f, ...)`` wrote the file object itself to stdout; the
            # file logging is done by ``f.write`` below.
            print('AE RMSE for Modality', i, str(rmse))
            f.write('AE RMSE for Modality' + '\t' + str(i) + '\t' +
                    str(rmse) + '\n')

            print('AE error ratio for Modality', i, str(ratio))
            f.write('AE error ratio for Modality' + '\t' + str(i) + '\t' +
                    str(ratio) + '\n')
# Ejemplo n.º 4
# 0
def test_SAE(data = '',validationdata='',param_list= [], n_hidden1 = 288,n_hidden2 = 100,missing1= False, missing2 = False,share1 = False,share2 = True,
             missingrate1 = 0, missingrate2 = 0,learningrate = 0.08, training_epochs = 3,batch_size = 3000,
             output_folder = ''):
    """Fine-tune and evaluate a stacked auto-encoder built from saved checkpoints.

    Variant of the stacked-auto-encoder driver in which the greedy
    layer-wise pre-training stage is commented out: instead of training
    ``ae1``/``ae2`` here, two previously pickled models are loaded from
    hard-coded paths, stacked into an ``SAE``, fine-tuned with mini-batch
    SGD and early stopping (best model checkpointed under
    ``../Result/<output_folder>``), then evaluated per modality (RMSE and
    error ratio) from numerical forward passes of the best checkpoint.

    NOTE(review): ``ae1`` and ``ae2`` point at the *same* checkpoint file
    (``m_HI_1/best_model_epoch_10.pkl``); the second was presumably meant
    to come from ``m_HI_2`` -- confirm before trusting the results.

    NOTE(review): ``param_list=[]`` is a mutable default argument; it looks
    read-only here, but confirm ``SAE``/``load_data`` never mutate it.
    """


    newpath = '../Result/'+ output_folder
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    # (disabled) Greedy layer-wise pre-training -- kept for reference:
    # ae1, h1, h_valid, indi_matrix_test,indi_matrix_valid_test = \
    #     build_block(data,validationdata, param_list,288,share1,missing1, missingrate1,
    #                   learningrate, training_epochs,
    #                   batch_size, output_folder='m_HI_1',order = 1)
    #
    # print('Fininshed training the first auto encoder')
    #
    #
    # ae2,h2,h2_valid,indi_matrix_test2,indi_matrix_valid_test2 = \
    #     build_block(h1, h_valid, param_list,100,share2,missing = missing2, missing_rate = missingrate2,
    #                   learning_rate=learningrate, training_epochs= training_epochs,
    #                   batch_size = 30, output_folder='m_HI_2',order = 2)
    #
    # print('Fininshed training the second auto encoder')
    #
    #
    #
    #
    # datasets,indi_matrix,data_test,n_train_batches,numMod,raw,trainstats_list,visible_size_Mod =\
    #     load_data_sae(param_list,data,indi_matrix_test,batch_size,train = True)
    #
    #
    # valid_batch_size = 306
    # validset,valid_indi_matrix, n_valid_batches =\
    #     load_data_sae(param_list,validationdata,indi_matrix_valid_test,valid_batch_size,train = False)
    #


    # Training data; ``indi_matrix`` presumably encodes the missing-data
    # mask produced at rate ``missingrate1`` -- verify against load_data.
    datasets,indi_matrix,data_test,indi_matrix_test,n_train_batches,numMod,raw,trainstats_list,visible_size_Mod = \
        load_data(param_list,data,batch_size,missingrate1,train = True,order = 1)

    ####################################
    # Initializing validation dataset  #
    ####################################
    valid_batch_size = 306

    validset,valid_indi_matrix,valid_test,indi_matrix_valid_test,n_valid_batches = \
        load_data(param_list,validationdata,valid_batch_size, missingrate1,train = False,order = 1)

    # Both stages load the same pickle -- see NOTE(review) in the docstring.
    ae1 = '../Result/m_HI_1/best_model_epoch_10.pkl'
    ae2 = '../Result/m_HI_1/best_model_epoch_10.pkl'


    # Symbolic variables: mini-batch index, input matrix, missingness mask.
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')
    y = T.matrix('y')
    # end-snippet-2


    sae = SAE(
        ae1=ae1,
        ae2=ae2,
        input = x,
        indi_matrix = y,
        missing = True,
        param_list = param_list,
    )



    cost,updates = sae.finetuning(learningrate)

    train_sae = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: datasets[index * batch_size: (index + 1) * batch_size],
            y: indi_matrix[index * batch_size: (index + 1) * batch_size]
        },
        # DebugMode makes Theano run extensive self-checks on every op;
        # it is very slow and normally only used while debugging.
        mode='DebugMode',
        on_unused_input='warn'
    )


    validate_cost = sae.get_cost()

    validate_sae = theano.function(
        [index],
        validate_cost,
        givens={
            x: validset[index * valid_batch_size: (index + 1) * valid_batch_size],
            y: valid_indi_matrix[index * valid_batch_size: (index + 1) * valid_batch_size]
        },
        on_unused_input='warn'
    )


    ###############
    # TRAIN MODEL #
    ###############
    print('... training the sae model...')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = np.inf
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    best_epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        c= []
        print('...Starting training with SGD and check validation here...')
        print(n_train_batches)
        for minibatch_index in range(int(n_train_batches)):

            a = train_sae(minibatch_index)
            c.append(a)
            print(a)
            # iteration number indicate how many batches we have already runned on
            # (NOTE: ``iter`` shadows the builtin of the same name)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            print('...starting validation here...')

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_sae(i)
                                     for i in range(int(n_valid_batches))]
                this_validation_loss = np.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation cost %f ' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    # save the best model
                    print('saving the model for epoch %i' % epoch)
                    best_epoch = epoch
                    #f = open('../Result_test/best_model_epoch_' +str() + '.txt', 'w')
                    save(newpath + '/best_model_epoch_' +str(epoch) +'.pkl', sae)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f'

        )
        % (best_validation_loss)
    )
    # NOTE(review): Python-2 ``print >> sys.stderr`` leftover -- this prints
    # the stderr stream object to stdout rather than writing to stderr.
    print( sys.stderr, ('The code for file ' +os.path.split(__file__)[1]
                          +' ran for %.1fs' % ((end_time - start_time))))


    print('Best epoch is %i' % best_epoch )
    print('Now we starting computing the RMSE and error ratio')

    # Rebinds ``sae`` from the model object to the checkpoint path string.
    sae = newpath + '/best_model_epoch_' +str(best_epoch) +'.pkl'

    W1,W2,b1,b2,G,W3,W4,b3,b4,G_share = load_numerical_params_sae(sae)

    # Numerical (NumPy) forward pass through the stacked encoder/decoder.
    h1 = get_h1(missing1,data_test,W1,b1,G)
    h2 = get_h2(h1,W2,b2,G_share)
    h3 = get_h3(h2,W3,b3,G_share)
    reconstruction = get_reconstruct(missing1,h3,W4,b4,G)

    print(reconstruction)

    np.savetxt(newpath+ '/output_' +str(best_epoch) + '.txt',reconstruction,delimiter=',')


    # NOTE(review): this file handle is never closed.
    f = open(newpath +'/AE_' +str(best_epoch) + '.txt', 'w')


    # Per-modality report: dump raw and denormalised reconstructed slices,
    # then RMSE and error-ratio metrics.
    for i in range(int(numMod)):
        np.savetxt(newpath +'/Raw_' +str(i) + '.txt',
        raw[:,i*visible_size_Mod:(i+1)*visible_size_Mod],delimiter=',')
        np.savetxt(newpath+'/Recstru_' +str(i) + '_' +str(best_epoch) + '.txt',
        denormActiv(reconstruction[:,i*visible_size_Mod:(i+1)*visible_size_Mod],trainstats_list[i]),delimiter=',')



        # NOTE(review): ``print(f, ...)`` is a Python-2 ``print >> f``
        # leftover -- it prints the file object to stdout; the actual file
        # logging is done by the ``f.write`` call below.
        print(f, 'AE RMSE for Modality', i, str(RMSE(raw[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                              denormActiv(reconstruction[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                                       trainstats_list[i]))))


        f.write('AE RMSE for Modality'+ '\t' + str(i) +'\t'+ str(RMSE(raw[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                              denormActiv(reconstruction[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                                          trainstats_list[i]))) + '\n')


        # NOTE(review): same Python-2 ``print >> f`` artifact as above.
        print(f, 'AE error ratio for Modality', i, str(error_ratio(raw[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                              denormActiv(reconstruction[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                                       trainstats_list[i]))))

        f.write('AE error ratio for Modality'+ '\t' + str(i) +'\t'+ str(error_ratio(raw[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                              denormActiv(reconstruction[:,i*visible_size_Mod:(i+1)*visible_size_Mod],
                                                          trainstats_list[i]))) + '\n')