def test_hmm(gen_hmm, test_names, window_size, rank, start_base):
    """Print the mean error of ``gen_hmm`` for every test patient.

    One undivided windowed record is read per entry of ``test_names``; its
    observations are converted with ``create_labels`` and scored against
    the record's states with ``mean_error``. Nothing is returned.
    """
    reader = ICHISeqDataReader(test_names)

    for patient_name in test_names:
        # undivided read: one (observations, states) pair per patient
        raw_x, raw_y = reader.read_one_with_window(
            window_size=window_size,
            divide=False
        )
        observations = create_labels(
            matrix=raw_x.eval(),
            rank=rank,
            window_size=window_size,
            start_base=start_base
        )

        # mean error value for this single patient of the test set
        patient_error = mean_error(
            gen_hmm=gen_hmm,
            obs_seq=observations,
            actual_states=raw_y.eval()
        )

        print(patient_error, ' error for patient ' + str(patient_name))

        gc.collect()
def test_sda(sda, test_names, base, window_size=1, algo='viterbi'):
    """Print the error of ``sda.hmmLayer`` for every test patient.

    Each patient's record is cut into sliding windows of ``window_size``
    rows, every window is passed through ``sda.get_da_output`` and the
    flattened outputs are handed to ``change_data_for_one_patient``
    together with the patient's states. The resulting sequences are scored
    with ``get_error_on_patient`` using the decoder named by ``algo``.
    """
    reader = ICHISeqDataReader(test_names)
    # The values read here are replaced on the first loop iteration.
    doc_x, doc_y = reader.read_all()

    for patient_idx in xrange(len(test_names)):
        doc_x, doc_y = reader.read_next_doc()
        samples = doc_x.get_value()
        n_windows = samples.shape[0] - window_size + 1

        # encode every sliding window, then flatten everything to 1-D
        encoded_windows = []
        for start in xrange(n_windows):
            window = samples[start:start + window_size]
            encoded_windows.append(sda.get_da_output(window).ravel())
        visible_after_sda = numpy.array(encoded_windows).ravel()

        visible_seq, hidden_seq = change_data_for_one_patient(
            hiddens_patient=doc_y.eval(),
            visibles_patient=visible_after_sda,
            window_size=sda.da_layers_output_size,
            base_for_labels=base
        )

        patient_error = get_error_on_patient(
            model=sda.hmmLayer,
            visible_set=visible_seq,
            hidden_set=hidden_seq,
            algo=algo
        )

        print(patient_error, ' error for patient ' + str(patient_idx))
        gc.collect()
 def validate_model(self, valid_names, window_size, rank, start_base):
     """Return the mean error of this model over all validation patients.

     The labelled observations and the actual states of every patient in
     ``valid_names`` are appended to two running sequences; their lengths
     are printed before ``mean_error`` is evaluated on them.
     """
     reader = ICHISeqDataReader(valid_names)
     observations = []
     states = []
     for _ in xrange(len(valid_names)):
         raw_x, raw_y = reader.read_one_with_window(
             window_size=window_size,
             divide=False
         )
         labels = create_labels(
             matrix=raw_x.eval(),
             rank=rank,
             start_base=start_base
         )
         # grow both sequences by this patient's data
         observations = numpy.concatenate((observations, labels))
         states = numpy.concatenate((states, raw_y.eval()))
     print(len(observations), 'x')
     print(len(states), 'y')
     # mean error value for the patients in the validation set
     return mean_error(
         gen_hmm=self,
         obs_seq=observations,
         actual_states=states
     )
Example #4
0
def pretrain_sda_cg(sda, train_names, window_size, pretraining_epochs, corruption_levels):
    """Pre-train the dA layers of ``sda`` with conjugate gradient.

    For each training patient the cost and gradient callables are obtained
    from ``pretraining_functions_sda_cg`` and ``scipy.optimize.fmin_cg`` is
    run once per layer, starting from a zero vector and capped at
    ``pretraining_epochs`` iterations. The optimizer's return value is not
    used here. Returns ``sda``.
    """
    print("... getting the pretraining functions")
    import scipy.optimize

    reader = ICHISeqDataReader(train_names)

    for _ in xrange(len(train_names)):
        train_set_x, train_set_y = reader.read_next_doc()
        cost_fn, grad_fn = pretraining_functions_sda_cg(
            sda=sda,
            train_set_x=train_set_x,
            window_size=window_size,
            corruption_levels=corruption_levels
        )
        print("... pre-training the model")
        # using scipy conjugate gradient optimizer
        print("Optimizing using scipy.optimize.fmin_cg...")
        for layer_idx in xrange(sda.n_layers):
            layer = sda.dA_layers[layer_idx]
            initial_theta = numpy.zeros(
                (layer.n_visible + 1) * layer.n_hidden,
                dtype=layer.input.dtype
            )
            scipy.optimize.fmin_cg(
                f=partial(cost_fn, da_index=layer_idx),
                x0=initial_theta,
                fprime=partial(grad_fn, da_index=layer_idx),
                maxiter=pretraining_epochs,
            )
    return sda
def pretrain_sda_sgd(sda, train_names, window_size, pretraining_epochs,
                  pretrain_lr, corruption_levels):
    """Pre-train every dA layer of ``sda`` with stochastic gradient descent.

    For each layer a fresh reader walks over all training patients; for
    every patient ``pretraining_epochs`` passes are made over its sliding
    windows. The per-epoch cost, averaged over patients, is stored on the
    layer as ``train_cost_array`` in the form ``[[epoch, cost], ...]``.

    :param sda: model exposing ``n_layers`` and ``dA_layers``
    :param train_names: names of the training patients
    :param window_size: number of consecutive samples in one window
    :param pretraining_epochs: number of passes over each patient
    :param pretrain_lr: learning rate handed to the pretraining functions
    :param corruption_levels: one corruption level per layer
    :return: the same ``sda`` object
    """
    print('... getting the pretraining functions')
    pretraining_fns = pretraining_functions_sda_sgd(sda=sda,
                                                    window_size=window_size)

    print('... pre-training the model')
    n_train_patients = len(train_names)
    ## Pre-train layer-wise
    for i in xrange(sda.n_layers):
        cur_dA = sda.dA_layers[i]
        cur_dA.train_cost_array = []
        layer_fn = pretraining_fns[i]
        layer_corruption = corruption_levels[i]
        cur_train_cost = []
        train_reader = ICHISeqDataReader(train_names)
        for _ in xrange(n_train_patients):
            train_set_x, train_set_y = train_reader.read_next_doc()
            # Fetch the patient's data once instead of once per sample:
            # get_value() may have to copy the buffer on every call.
            train_data = train_set_x.get_value(borrow=True)
            n_train_samples = train_data.shape[0] - window_size + 1
            patient_costs = []
            # go through pretraining epochs
            for epoch in xrange(pretraining_epochs):
                epoch_costs = [
                    layer_fn(index=index,
                             train_set=train_data,
                             corruption=layer_corruption,
                             lr=pretrain_lr)
                    for index in xrange(n_train_samples)
                ]
                patient_costs.append(numpy.mean(epoch_costs))
            cur_train_cost.append(patient_costs)
            gc.collect()

        # average the per-epoch costs over all patients
        cur_dA.train_cost_array = [
            [epoch, cost]
            for epoch, cost in enumerate(numpy.mean(cur_train_cost, axis=0))
        ]
    return sda
def test_sda(sda, test_names, rank, start_base, window_size=1, algo='viterbi'):
    """Print the labelling error of ``sda.hmmLayer`` for each test patient.

    Every windowed row of a patient is encoded by ``sda.get_da_output``,
    the encodings are converted by ``create_labels_after_das`` and the
    sequence returned by ``sda.hmmLayer.define_labels_seq`` is compared
    with the actual states via ``errors``. ``algo`` is accepted but not
    used in this variant.
    """
    reader = ICHISeqDataReader(test_names)

    for patient_idx in xrange(len(test_names)):
        windowed_x, actual_y = reader.read_one_with_window(
            window_size=window_size,
            divide=False
        )
        windowed_x = windowed_x.eval()
        actual_y = actual_y.eval()

        # encode each row separately, reshaped to a single-row 2-D input
        encoded_rows = []
        for row_idx in xrange(windowed_x.shape[0]):
            row = numpy.array(windowed_x[row_idx]).reshape(1, -1)
            encoded_rows.append(sda.get_da_output(row))
        visible_after_sda = numpy.array(encoded_rows)

        visible_labels = create_labels_after_das(
            da_output_matrix=visible_after_sda,
            rank=rank,
            start_base=start_base
        )
        # the actual states are compared untrimmed (no half-window cut)
        predicted_states = sda.hmmLayer.define_labels_seq(visible_labels)
        error_array = errors(
            predicted_states=numpy.array(predicted_states),
            actual_states=numpy.array(actual_y)
        )
        patient_error = error_array.eval().mean()

        print(patient_error, ' error for patient ' + str(patient_idx))
        gc.collect()
 def train(self, train_names, valid_names, window_size, rank, start_base):
     """Fit the HMMs patient by patient, validating after each patient.

     For every training patient the record is read with ``divide=True``;
     part ``i`` of the result is labelled with ``create_labels`` and used
     to fit ``self.hmm_models[i]``. After each patient the pair
     ``[patient_index, validation_error]`` is appended to
     ``self.valid_error_array``.
     """
     reader = ICHISeqDataReader(train_names)
     for patient_idx in xrange(len(train_names)):
         # data divided into sequences with respect to labels
         divided_set = reader.read_one_with_window(
             window_size=window_size,
             divide=True
         )
         for hmm_idx in xrange(self.n_hmms):
             labels = create_labels(
                 matrix=divided_set[hmm_idx].eval(),
                 rank=rank,
                 start_base=start_base
             )
             # a single sequence, shaped as one column
             observations = numpy.array(labels).reshape(-1, 1)
             self.hmm_models[hmm_idx].fit([observations])

         validation_error = self.validate_model(
             valid_names=valid_names,
             window_size=window_size,
             rank=rank,
             start_base=start_base
         )
         self.valid_error_array.append([patient_idx, validation_error])

         gc.collect()
def test_da_params(corruption_level):
    """Run ``train_dA`` once per window size on the fixed training patients.

    Every run uses one training epoch, a hidden layer of twice the window
    size and ``'dA_cg_plots'`` as ``base_folder``; ``output_folder`` is the
    bracketed, comma-joined list of patient names.
    """
    train_data = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022', 'p040', 'p045', 'p048']

    train_set, train_labels = ICHISeqDataReader(train_data).read_all()

    output_folder = '[%s]' % ",".join(train_data)

    for ws in (13, 30, 50, 75, 100):
        train_dA(
            training_epochs=1,
            window_size=ws,
            corruption_level=corruption_level,
            n_hidden=2 * ws,
            dataset=train_set,
            output_folder=output_folder,
            base_folder='dA_cg_plots'
        )
def train():
    """Build an ``HMM_second`` on three training patients and validate it.

    The train, validation and test groups are all read with
    ``read_all_for_second_hmm``; progress messages are written to
    ``debug_file``. The test data is read but not used afterwards.
    """
    train_data_names = ['p10a', 'p011', 'p013']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']

    # read the groups in the order train, valid, test
    loaded = []
    for names in (train_data_names, valid_data, test_data):
        # data divided into sequences with respect to labels
        set_x, set_y = ICHISeqDataReader(names).read_all_for_second_hmm()
        loaded.append((set_x, set_y))
    train_pair, valid_pair, test_pair = loaded

    debug_file.write('data is got')

    rank = 1
    base = pow(10, rank) + 1
    n_visible_labels = pow(base, 3)

    trained_HMM = HMM_second(
        n_visible=n_visible_labels,
        train_set=train_pair,
        train_patient_list=train_data_names
    )
    gc.collect()
    debug_file.write('Hmm created')
    debug_file.write('Start validation')
    validation(
        HMM=trained_HMM,
        patient_list=valid_data,
        valid_set=valid_pair
    )
def test_all_params():
    """Call ``test_params`` for every (learning rate, window size) pair.

    The train, validation and test patient groups are read in full once
    and shared by all runs; each run is given ``n_epochs=1`` and
    ``'regression_plots'`` as ``base_folder``.
    """
    learning_rates = [0.0001]
    window_sizes = [13]

    train_data = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022', 'p040', 'p045', 'p048']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']
    patient_groups = (train_data, valid_data, test_data)

    # [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    datasets = []
    for names in patient_groups:
        set_x, set_y = ICHISeqDataReader(names).read_all()
        datasets.append((set_x, set_y))

    output_folder = '[%s], [%s], [%s]' % tuple(
        ",".join(names) for names in patient_groups
    )

    for lr in learning_rates:
        for ws in window_sizes:
            test_params(
                learning_rate=lr,
                n_epochs=1,
                window_size=ws,
                datasets=datasets,
                output_folder=output_folder,
                base_folder='regression_plots'
            )
def train():
    """Create an ``HMM`` from the divided sequences of two training patients.

    All three patient groups are read with ``read_all_and_divide`` and
    passed to the ``HMM`` constructor together with seven-entry lists of
    visible sizes, hidden sizes and epoch counts.
    """
    train_data_names = ['p10a', 'p011']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']

    # data divided into sequences with respect to labels
    train_visible_seqs = ICHISeqDataReader(train_data_names).read_all_and_divide()
    valid_visible_seqs = ICHISeqDataReader(valid_data).read_all_and_divide()
    test_visible_seqs = ICHISeqDataReader(test_data).read_all_and_divide()

    print('data is got')

    rank = 1
    base = 10 ** rank + 1
    n_visible_labels = base ** 3
    n_models = 7

    print('start creation')
    trained_HMM = HMM(
        n_visibles=[n_visible_labels] * n_models,
        n_hiddens=[200] * n_models,
        train_seqs=train_visible_seqs,
        n_epochs=[1] * n_models,
        train_data_names=train_data_names,
        valid_seqs=valid_visible_seqs,
        test_seqs=test_visible_seqs
    )

    print('Hmm created')
def test_all_params():
    """Call ``test_SdA`` once per window size on the fixed patient split.

    The train, validation and test groups are read in full once; each run
    is given 100 pre-training epochs, 1000 training epochs and
    ``'SdA_cg_plots'`` as ``base_folder``.
    """
    window_sizes = [13]

    train_data = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022', 'p040', 'p045', 'p048']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']
    patient_groups = (train_data, valid_data, test_data)

    # [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    datasets = []
    for names in patient_groups:
        set_x, set_y = ICHISeqDataReader(names).read_all()
        datasets.append((set_x, set_y))

    output_folder = '[%s], [%s], [%s]' % tuple(
        ",".join(names) for names in patient_groups
    )

    for ws in window_sizes:
        test_SdA(
            datasets=datasets,
            output_folder=output_folder,
            base_folder='SdA_cg_plots',
            window_size=ws,
            pretraining_epochs=100,
            training_epochs=1000
        )
Example #13
0
def test_da_params(corruption_level):
    """Run ``train_dA`` over a grid of learning rates and window sizes.

    Window sizes 1 through 13 are tried for each of eight learning rates;
    every run uses one epoch, a hidden layer of twice the window size and
    ``'dA_plots'`` as ``base_folder``. The validation and test groups are
    read but only their names are used (in ``output_folder``).
    """
    learning_rates = [0.001, 0.003, 0.005, 0.007, 0.009, 0.011, 0.013, 0.015]
    window_sizes = range(1, 14)

    train_data = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022', 'p040', 'p045', 'p048']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']

    train_set, train_labels = ICHISeqDataReader(train_data).read_all()
    valid_set, valid_labels = ICHISeqDataReader(valid_data).read_all()
    test_set, test_labels = ICHISeqDataReader(test_data).read_all()

    output_folder = '[%s], [%s], [%s]' % (
        ",".join(train_data),
        ",".join(valid_data),
        ",".join(test_data),
    )

    for lr in learning_rates:
        for ws in window_sizes:
            train_dA(
                learning_rate=lr,
                training_epochs=1,
                window_size=ws,
                corruption_level=corruption_level,
                n_hidden=2 * ws,
                train_set=train_set,
                output_folder=output_folder,
                base_folder='dA_plots'
            )
def test_sda(sda, test_names, rank, start_base, window_size=1, algo='viterbi'):
    """Print the error of ``sda.hmmLayer`` for every test patient.

    Each patient's record is cut into sliding windows that are encoded by
    ``sda.get_da_output`` and converted by ``create_labels_after_das``.
    The patient's states, with half a window removed from each end, are
    then scored against those labels by ``get_error_on_patient`` using the
    decoder named by ``algo``.
    """
    reader = ICHISeqDataReader(test_names)
    # The values read here are replaced on the first loop iteration.
    doc_x, doc_y = reader.read_all()

    for patient_idx in xrange(len(test_names)):
        doc_x, doc_y = reader.read_next_doc()
        samples = doc_x.get_value()
        states = doc_y.eval()
        # NOTE(review): this gives shape[0] - window_size windows, so the
        # window starting at the last valid position is never encoded --
        # confirm that this is intended.
        n_windows = samples.shape[0] - window_size

        encoded_windows = []
        for start in xrange(n_windows):
            window = samples[start:start + window_size]
            encoded_windows.append(sda.get_da_output(window).ravel())
        visible_after_sda = numpy.array(encoded_windows)

        visible_labels = create_labels_after_das(
            da_output_matrix=visible_after_sda,
            rank=rank,
            start_base=start_base,
            window_size=window_size
        )

        # drop half a window of states from both ends
        half_window = int(window_size / 2)
        hidden_states = states[half_window:len(states) - half_window]

        patient_error = get_error_on_patient(
            model=sda.hmmLayer,
            visible_set=visible_labels,
            hidden_set=hidden_states,
            algo=algo
        )

        print(patient_error, ' error for patient ' + str(patient_idx))
        gc.collect()
def test_all_params():
    """Train an SdA with ``train_SdA`` and evaluate it with ``test_sda``.

    All three patient groups are read in full up front; for each window
    size a model is trained with 15 pre-training epochs and then tested on
    the test patients.
    """
    window_sizes = [1]

    train_data = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022', 'p040', 'p045', 'p048']
    valid_data = ['p09b', 'p023', 'p035', 'p038']
    test_data = ['p09a', 'p033']
    patient_groups = (train_data, valid_data, test_data)

    # [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    datasets = []
    for names in patient_groups:
        set_x, set_y = ICHISeqDataReader(names).read_all()
        datasets.append((set_x, set_y))

    output_folder = '[%s], [%s], [%s]' % tuple(
        ",".join(names) for names in patient_groups
    )
    corruption_levels = [.1, .2]
    pretrain_lr = .03

    rank = 1
    start_base = 5
    base = pow(start_base, rank) + 1

    for ws in window_sizes:
        trained_sda = train_SdA(
            datasets=datasets,
            train_names=train_data,
            output_folder=output_folder,
            base_folder='SdA_second_hmm_without_window',
            window_size=ws,
            corruption_levels=corruption_levels,
            pretrain_lr=pretrain_lr,
            base=base,
            pretraining_epochs=15
        )
        test_sda(
            sda=trained_sda,
            test_names=test_data,
            base=base
        )
Example #16
0
def test_all_params():
    """Call ``test_SdA`` once per window size with a one-patient training set.

    The train, validation and test groups are read in full once; each run
    is given one pre-training epoch, one training epoch and
    ``"SdA_sgd_cg_plots"`` as ``base_folder``.
    """
    window_sizes = [10]

    # full set: ['p10a','p011','p013','p014','p020','p022','p040','p045','p048']
    train_data = ["p10a"]
    valid_data = ["p09b", "p023", "p035", "p038"]
    test_data = ["p09a", "p033"]
    patient_groups = (train_data, valid_data, test_data)

    # [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    datasets = []
    for names in patient_groups:
        set_x, set_y = ICHISeqDataReader(names).read_all()
        datasets.append((set_x, set_y))

    output_folder = "[%s], [%s], [%s]" % tuple(
        ",".join(names) for names in patient_groups
    )
    corruption_levels = [0.1, 0.2]
    learning_rate = 0.03

    for ws in window_sizes:
        test_SdA(
            datasets=datasets,
            output_folder=output_folder,
            base_folder="SdA_sgd_cg_plots",
            window_size=ws,
            corruption_levels=corruption_levels,
            pretrain_lr=learning_rate,
            finetune_lr=learning_rate,
            pretraining_epochs=1,
            training_epochs=1,
        )
def train_SdA(train_names, valid_names,
             output_folder, base_folder,
             window_size,
             corruption_levels,
             pretraining_epochs,
             start_base,
             rank,
             pretrain_lr):
    """
    Build an SdA, pre-train it and attach an HMM layer trained per patient.

    The autoencoder stack is pre-trained with conjugate gradient
    (``pretrain_sda_cg``) and the cost array of every layer is passed to
    ``visualize_pretraining``. A ``GeneralHMM`` is then fitted, patient by
    patient, on the labelled autoencoder outputs and validated after every
    training patient.

    :type train_names: list
    :param train_names: names of the patients used for training
    :type valid_names: list
    :param valid_names: names of the patients used for validation
    :type output_folder: string
    :param output_folder: passed to ``visualize_pretraining`` as
        ``datasets_folder``
    :type base_folder: string
    :param base_folder: passed to ``visualize_pretraining`` as ``base_folder``
    :type window_size: int
    :param window_size: samples per window; the network gets
        ``window_size*3`` inputs
    :type corruption_levels: list
    :param corruption_levels: one corruption level per dA layer
    :type pretraining_epochs: int
    :param pretraining_epochs: number of epoch to do pretraining
    :param start_base: forwarded to the label-creation helpers
    :param rank: forwarded to the label-creation helpers
    :param pretrain_lr: not used by the conjugate-gradient pre-training
        performed here
    :return: the SdA with its HMM layer set
    """

    n_in = window_size*3  # number of input units
    n_out = 7  # number of output units

    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')
    # construct the stacked denoising autoencoder class
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=n_in,
        hidden_layers_sizes=[window_size*2, window_size],
        n_outs=n_out
    )

    #########################
    # PRETRAINING THE MODEL #
    #########################

    start_time = timeit.default_timer()
    # conjugate-gradient pre-training; pretrain_lr is not needed for it
    pretrained_sda = pretrain_sda_cg(sda=sda,
                                  train_names=train_names,
                                  window_size=window_size,
                                  pretraining_epochs=pretraining_epochs,
                                  corruption_levels=corruption_levels)

    end_time = timeit.default_timer()

    for i in xrange(sda.n_layers):
        print(i, 'i pretrained')
        visualize_pretraining(train_cost=pretrained_sda.dA_layers[i].train_cost_array,
                              window_size=window_size,
                              learning_rate=0,
                              corruption_level=corruption_levels[i],
                              n_hidden=sda.dA_layers[i].n_hidden,
                              da_layer=i,
                              datasets_folder=output_folder,
                              base_folder=base_folder)

    # same text as the former ``print >> sys.stderr`` statement
    sys.stderr.write('The pretraining code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')

    ########################
    # FINETUNING THE MODEL #
    ########################

    n_hiddens = [5]*n_out

    # create hmm container
    hmmLayer = GeneralHMM(
        n_hiddens=n_hiddens,
        n_hmms=n_out
    )

    train_reader = ICHISeqDataReader(train_names)
    n_train_patients = len(train_names)
    # train hmms on data of each patient
    for train_patient in xrange(n_train_patients):
        # get data divided on sequences with respect to labels
        train_set = train_reader.read_one_with_window(
            window_size=window_size,
            divide=True
        )
        for i in xrange(hmmLayer.n_hmms):
            cur_train_set = train_set[i].eval()
            if cur_train_set.shape[0] <= 0:
                # nothing to fit for this HMM from this patient
                continue
            # reuse the evaluated array instead of evaluating a second time
            print('train_set[i].eval(): ', cur_train_set.shape)
            # encode each row separately, reshaped to a single-row 2-D input
            train_visible_after_sda = numpy.array([sda.get_da_output(
                numpy.array(cur_train_set[time]).reshape(1, -1))
                for time in xrange(cur_train_set.shape[0])])

            x_labels = create_labels_after_das(
                da_output_matrix=train_visible_after_sda,
                rank=rank,
                start_base=start_base
            )
            hmmLayer.hmm_models[i].fit([numpy.array(x_labels).reshape(-1, 1)])

        error_cur_epoch = hmmLayer.validate_model(
            valid_names=valid_names,
            window_size=window_size,
            rank=rank,
            start_base=start_base
        )
        # record [patient index, validation error]
        hmmLayer.valid_error_array.append([])
        hmmLayer.valid_error_array[-1].append(train_patient)
        hmmLayer.valid_error_array[-1].append(error_cur_epoch)

        gc.collect()

    gc.collect()
    print('MultinomialHMM created')

    sda.set_hmm_layer(
        hmm_model=hmmLayer
    )
    return sda
def train_all_data():
    """Train one HMM on the training patients and score it on validation.

    Each patient group is read with ``read_all_for_second_hmm`` and
    converted by ``change_data_for_ws``; the model comes from
    ``create_hmm_for_all_data`` and is evaluated by ``get_error_on_model``
    on the validation data. The test group is prepared but not used.
    """
    # full set: ['p10a','p011','p013','p014','p020','p022','p040','p045','p048']
    train_data_names = ["p10a"]
    valid_data = ["p09b", "p023", "p035", "p038"]
    test_data = ["p09a", "p033"]

    rank = 1
    start_base = 5
    base = pow(start_base, rank) + 1

    # read the groups in the order train, valid, test
    raw_sets = []
    for names in (train_data_names, valid_data, test_data):
        reader = ICHISeqDataReader(names)
        # data divided into sequences with respect to labels
        set_x, set_y = reader.read_all_for_second_hmm(rank=rank, start_base=start_base)
        raw_sets.append((set_x, set_y, len(names)))

    print("data is got")

    n_visible_labels = pow(base, 3)
    n_hidden = 7
    window_size = 1

    prepared = []
    for set_x, set_y, n_patients in raw_sets:
        new_x, new_y = change_data_for_ws(
            dataset=(set_x, set_y),
            window_size=window_size,
            base_for_labels=n_visible_labels,
            n_patients=n_patients,
        )
        prepared.append((new_x, new_y))
    new_train_set, new_valid_set, new_test_set = prepared

    trained_HMM = create_hmm_for_all_data(
        n_hidden=n_hidden,
        n_visible=pow(n_visible_labels, window_size),
        train_set=new_train_set,
        n_patients=len(train_data_names),
        window_size=window_size,
    )

    gc.collect()
    print("Hmm created")
    get_error_on_model(
        model=trained_HMM,
        n_patients=len(valid_data),
        test_set=new_valid_set,
        window_size=1,
    )
def train_separately():
    """Accumulate HMM parameters patient by patient, then validate.

    The start, transition and emission accumulators are updated with
    ``update_params_on_patient`` for every training patient and finalised
    by ``finish_training``. They are then installed on an
    ``hmm.MultinomialHMM`` whose error is printed for each validation
    patient, decoding with ``"viterbi"``.
    """
    train_data_names = [
        "p10a", "p011", "p013", "p014", "p020",
        "p022", "p040", "p045", "p048", "p09b",
        "p023", "p035", "p038", "p09a", "p033",
    ]
    valid_data = ["p09b", "p023", "p035", "p038", "p09a", "p033"]

    n_train_patients = len(train_data_names)

    rank = 1
    start_base = 5
    base = pow(start_base, rank) + 1
    n_visible_labels = pow(base, 3)
    window_size = 1
    n_visible = pow(n_visible_labels, window_size)
    n_hidden = 7

    train_reader = ICHISeqDataReader(train_data_names)
    valid_reader = ICHISeqDataReader(valid_data)

    # zero-initialised accumulators for the model parameters
    pi_values = numpy.zeros((n_hidden,))
    a_values = numpy.zeros((n_hidden, n_hidden))
    b_values = numpy.zeros((n_hidden, n_visible))
    array_from_hidden = numpy.zeros((n_hidden,))

    for _ in xrange(n_train_patients):
        # data divided into sequences with respect to labels
        doc_x, doc_y = train_reader.read_doc_for_second_hmm(rank=rank, start_base=start_base)

        visibles, hiddens = change_data_for_one_patient(
            hiddens_patient=doc_y.eval(),
            visibles_patient=doc_x.eval(),
            window_size=window_size,
            base_for_labels=n_visible_labels,
        )

        pi_values, a_values, b_values, array_from_hidden = update_params_on_patient(
            pi_values=pi_values,
            a_values=a_values,
            b_values=b_values,
            array_from_hidden=array_from_hidden,
            hiddens_patient=hiddens,
            visibles_patient=visibles,
            n_hidden=n_hidden,
        )

        gc.collect()

    pi_values, a_values, b_values = finish_training(
        pi_values=pi_values,
        a_values=a_values,
        b_values=b_values,
        array_from_hidden=array_from_hidden,
        n_hidden=n_hidden,
        n_patients=n_train_patients,
    )

    hmm_model = hmm.MultinomialHMM(
        n_components=n_hidden,
        startprob=pi_values,
        transmat=a_values,
    )
    hmm_model.n_symbols = n_visible
    hmm_model.emissionprob_ = b_values
    gc.collect()
    print("MultinomialHMM created")
    algo = "viterbi"

    for patient_idx in xrange(len(valid_data)):
        # data divided into sequences with respect to labels
        doc_x, doc_y = valid_reader.read_doc_for_second_hmm(rank=rank, start_base=start_base)

        visibles, hiddens = change_data_for_one_patient(
            hiddens_patient=doc_y.eval(),
            visibles_patient=doc_x.eval(),
            window_size=window_size,
            base_for_labels=n_visible_labels,
        )

        patient_error = get_error_on_patient(
            model=hmm_model,
            visible_set=visibles,
            hidden_set=hiddens,
            algo=algo,
        )

        print(patient_error, " error for patient " + str(patient_idx))
        gc.collect()
def train_SdA(datasets, train_names,
             output_folder, base_folder,
             window_size,
             corruption_levels,
             pretraining_epochs,
             base,
             pretrain_lr=0):
    """
    Pretrain a stacked denoising autoencoder (SdA) on ICHI data and put a
    MultinomialHMM layer on top of it.

    The SdA is pretrained with ``pretrain_sda_sgd``, the pretraining cost of
    every dA layer is handed to ``visualize_pretraining``, and then the HMM
    parameters are accumulated patient by patient from the SdA outputs.

    :type datasets: array
    :param datasets: [train_set, valid_set, test_set]; kept for backward
                     compatibility, this function does not read it (all
                     data comes from ICHISeqDataReader)

    :type train_names: list
    :param train_names: names of the documents passed to the pretraining
                        routine

    :type output_folder: string
    :param output_folder: folder for cost and error graphics with results
                          (passed to visualize_pretraining as
                          ``datasets_folder``)

    :type base_folder: string
    :param base_folder: passed through to visualize_pretraining

    :type window_size: int
    :param window_size: number of consecutive rows fed to the SdA at once;
                        the SdA has ``window_size * 3`` input units

    :type corruption_levels: sequence
    :param corruption_levels: one corruption level per dA layer

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs to do pretraining

    :type base: int
    :param base: passed to change_data_for_one_patient as
                 ``base_for_labels``; the HMM gets
                 ``base ** sda.da_layers_output_size`` visible symbols

    :type pretrain_lr: float
    :param pretrain_lr: learning rate used during pretraining

    :return: the SdA instance with its HMM layer set
    """

    n_in = window_size*3  # number of input units
    n_out = 7  # number of output units (also the number of HMM hidden states)

    # numpy random generator (fixed seed)
    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')
    # construct the stacked denoising autoencoder class
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=n_in,
        hidden_layers_sizes=[window_size*2, window_size],
        n_outs=n_out
    )

    #########################
    # PRETRAINING THE MODEL #
    #########################

    start_time = timeit.default_timer()

    pretrained_sda = pretrain_sda_sgd(sda=sda,
                                      train_names=train_names,
                                      window_size=window_size,
                                      pretraining_epochs=pretraining_epochs,
                                      pretrain_lr=pretrain_lr,
                                      corruption_levels=corruption_levels)

    end_time = timeit.default_timer()

    for i in xrange(sda.n_layers):
        print(i, 'i pretrained')
        # report the learning rate that was actually used for pretraining
        # (this used to be a hard-coded 0)
        visualize_pretraining(train_cost=pretrained_sda.dA_layers[i].train_cost_array,
                              window_size=window_size,
                              learning_rate=pretrain_lr,
                              corruption_level=corruption_levels[i],
                              n_hidden=sda.dA_layers[i].n_hidden,
                              da_layer=i,
                              datasets_folder=output_folder,
                              base_folder=base_folder)

    # same text as the former ``print >> sys.stderr`` statement, newline included
    sys.stderr.write('The pretraining code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')

    ########################
    # FINETUNING THE MODEL #
    ########################

    # NOTE(review): the HMM layer is fitted on this hard-coded list, not on
    # the ``train_names`` argument used for pretraining - confirm it is intended.
    train_data_names = ['p10a','p011','p013','p014','p020','p022','p040',
                        'p045','p048','p09b','p023','p035','p038', 'p09a','p033']

    n_train_patients = len(train_data_names)

    n_visible = pow(base, sda.da_layers_output_size)
    n_hidden = n_out

    train_reader = ICHISeqDataReader(train_data_names)

    # create matrices for params of HMM layer
    pi_values = numpy.zeros((n_hidden,))
    a_values = numpy.zeros((n_hidden, n_hidden))
    b_values = numpy.zeros((n_hidden, n_visible))
    array_from_hidden = numpy.zeros((n_hidden,))

    for train_patient in xrange(n_train_patients):
        # get data divided on sequences with respect to labels
        train_set_x, train_set_y = train_reader.read_next_doc()
        train_x_array = train_set_x.get_value()
        n_train_times = train_x_array.shape[0] - window_size + 1
        # run every window of ``window_size`` consecutive rows through the
        # SdA and flatten all outputs into one 1-D array
        train_visible_after_sda = numpy.array([
            sda.get_da_output(
                train_x_array[time: time+window_size]).ravel()
            for time in xrange(n_train_times)]).ravel()

        new_train_visible, new_train_hidden = change_data_for_one_patient(
            hiddens_patient=train_set_y.eval(),
            visibles_patient=train_visible_after_sda,
            window_size=sda.da_layers_output_size,
            base_for_labels=base
        )

        pi_values, a_values, b_values, array_from_hidden = update_params_on_patient(
            pi_values=pi_values,
            a_values=a_values,
            b_values=b_values,
            array_from_hidden=array_from_hidden,
            hiddens_patient=new_train_hidden,
            visibles_patient=new_train_visible,
            n_hidden=n_hidden
        )

        gc.collect()

    pi_values, a_values, b_values = finish_training(
        pi_values=pi_values,
        a_values=a_values,
        b_values=b_values,
        array_from_hidden=array_from_hidden,
        n_hidden=n_hidden,
        n_patients=n_train_patients
    )

    hmm_model = hmm.MultinomialHMM(
        n_components=n_hidden,
        startprob=pi_values,
        transmat=a_values
    )

    # emission parameters are set directly from the accumulated values
    hmm_model.n_symbols = n_visible
    hmm_model.emissionprob_ = b_values
    gc.collect()
    print('MultinomialHMM created')

    sda.set_hmm_layer(
        hmm_model=hmm_model
    )
    return sda
def train_separately():
    """
    Build a MultinomialHMM from hard-coded training patients and print the
    error obtained on each hard-coded validation patient.

    Documents are read one at a time with
    ``ICHISeqDataReader.read_doc_with_av_disp``; the start, transition and
    emission values are accumulated with ``update_params_on_patient`` and
    completed by ``finish_training``. Nothing is returned.

    Note: every name in the validation list also appears in the training
    list.
    """
    # settings handed to the reader
    rank = 1
    start_base = 10
    window_size = 1
    reader_options = dict(
        rank=rank,
        start_base=start_base,
        window_size=window_size
    )

    # model dimensions
    base = pow(start_base, rank) + 1
    n_hidden = 7
    n_visible = pow(base, 6)

    train_data_names = ['p10a', 'p011', 'p013', 'p014', 'p020', 'p022',
                        'p040', 'p045', 'p048', 'p09b', 'p023', 'p035',
                        'p038', 'p09a', 'p033']
    valid_data = ['p09b', 'p023', 'p035', 'p038', 'p09a', 'p033']
    n_train_patients = len(train_data_names)
    n_valid_patients = len(valid_data)

    train_reader = ICHISeqDataReader(train_data_names)
    valid_reader = ICHISeqDataReader(valid_data)

    # accumulators for the HMM parameters, all starting at zero
    pi_values = numpy.zeros(n_hidden)
    a_values = numpy.zeros((n_hidden, n_hidden))
    b_values = numpy.zeros((n_hidden, n_visible))
    array_from_hidden = numpy.zeros(n_hidden)

    # ---- training: one document per iteration ----
    for _ in xrange(n_train_patients):
        observed, states = train_reader.read_doc_with_av_disp(**reader_options)
        state_seq = states.eval()
        observed_seq = observed.eval()

        pi_values, a_values, b_values, array_from_hidden = \
            update_params_on_patient(
                pi_values=pi_values,
                a_values=a_values,
                b_values=b_values,
                array_from_hidden=array_from_hidden,
                hiddens_patient=state_seq,
                visibles_patient=observed_seq,
                n_hidden=n_hidden
            )

        gc.collect()

    pi_values, a_values, b_values = finish_training(
        pi_values=pi_values,
        a_values=a_values,
        b_values=b_values,
        array_from_hidden=array_from_hidden,
        n_hidden=n_hidden,
        n_patients=n_train_patients
    )

    # ---- model assembly ----
    hmm_model = hmm.MultinomialHMM(n_components=n_hidden,
                                   startprob=pi_values,
                                   transmat=a_values)
    hmm_model.n_symbols = n_visible
    hmm_model.emissionprob_ = b_values
    gc.collect()
    print('MultinomialHMM created')

    # ---- validation: one document per iteration ----
    for patient_idx in xrange(n_valid_patients):
        observed, states = valid_reader.read_doc_with_av_disp(**reader_options)
        observed_seq = observed.eval()
        state_seq = states.eval()

        patient_error = get_error_on_patient(
            model=hmm_model,
            visible_set=observed_seq,
            hidden_set=state_seq,
            algo='viterbi'
        )

        print(patient_error, ' error for patient ' + str(patient_idx))

        gc.collect()