from numpy import append

def generate_large_batch_parallel(args):
    # Worker: concatenate small batches into large batches and save them.
    # Assumes get_bag_of_words_matrix and save_large_batch from this module.
    batches, large_batch_size = args
    large_batches_lst = []
    x = None
    for batch in batches:
        # Append the input data for this batch.
        x_tmp = get_bag_of_words_matrix(batch)
        x = x_tmp if x is None else append(x, x_tmp, axis=0)
        if len(x) == large_batch_size and batches[-1] - batch >= large_batch_size:
            # A full large batch, with at least one more large batch still to come.
            large_batches_lst.append(int(batch))
            save_large_batch(int(batch), x)
            x = None
        elif batch == batches[-1]:
            # Final batch: flush whatever has accumulated.
            large_batches_lst.append(int(batch))
            save_large_batch(int(batch), x)
            x = None
    return large_batches_lst  # Report the saved large-batch ids to the caller.
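# Driver sketch (an assumption, not part of the original source): the
# *_parallel suffix suggests the worker above is mapped over a multiprocessing
# pool, each worker receiving a contiguous slice of the batch list. The helper
# name generate_large_batches is hypothetical.
from math import ceil
from multiprocessing import Pool

def generate_large_batches(batches, large_batch_size, processes=4):
    # Split the batch list into contiguous chunks, one per worker process.
    chunk_size = int(ceil(len(batches) / float(processes)))
    chunks = [batches[i:i + chunk_size] for i in range(0, len(batches), chunk_size)]
    pool = Pool(processes)
    results = pool.map(generate_large_batch_parallel,
                       [(chunk, large_batch_size) for chunk in chunks])
    pool.close()
    pool.join()
    return [b for lst in results for b in lst]  # Flatten the per-worker lists.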
def __init__(self, trainingdata=False):
    self.batches = data_processing.get_batch_list(trainingdata)
    # Data produced by running the documents through the neural network.
    self.lower_dimension_data = []      # Output data from the DBN.
    self.higher_dimensional_data = []   # Input data to the DBN.
    self.path = 'output'
    if not os.path.exists(self.path):
        os.makedirs(self.path)
    weights = rsm.get_weights()
    visible_biases = rsm.get_visible_biases()
    hidden_biases = rsm.get_hidden_biases()
    # Generate class indices and class names.
    if trainingdata:
        path = 'pickle/train/bag_of_words'
    else:
        path = 'pickle/test/bag_of_words'
    # Class indices for all documents.
    self.class_indices = self.__generate_class_indices__(path, self.batches)
    # Run through the batches and build the high- and low-dimensional data lists.
    for batch in range(len(self.batches)):
        print 'Batch ', batch + 1, ' of ', len(self.batches)
        d = data_processing.get_bag_of_words_matrix(self.batches[batch], trainingdata)
        self.higher_dimensional_data += list(d)
        self.lower_dimension_data += list(rsm.generate_output_data(d, weights, visible_biases, hidden_biases))
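# Sketch of rsm.generate_output_data (an assumption that mirrors the positive
# phase of rsm_learn further down): the RSM maps a bag-of-words matrix to
# hidden-unit probabilities, scaling the hidden biases by each document's
# length. visible_biases is only needed for reconstruction, not for this
# upward pass. The name rsm_output is hypothetical.
from numpy import dot, exp, outer, sum

def rsm_output(data, weights, visible_biases, hidden_biases):
    D = sum(data, axis=1)  # Total word count of each document.
    return 1.0 / (1.0 + exp(-(dot(data, weights) + outer(D, hidden_biases))))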
from numpy import append

def compare_real_data_to_reconstructed_data():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    class_indices = s.load(open(env_paths.get_class_indices_path(False, batches[0]).replace(".0", ""), "rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Pick the first document seen for each of up to 10 distinct classes.
    docs_by_class = {}
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in docs_by_class:
            continue
        docs_by_class[idx] = data[i]
        if len(docs_by_class) >= 10:
            break
    print docs_by_class.keys()
    data_points = docs_by_class.values()
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
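# Sketch of the forward pass assumed by generate_output_data above (an
# assumption; the real function lives in the DBN module). Each weight matrix
# carries an extra bias row, so a bias unit of 1 is appended to the
# activations before every layer. The name forward_pass is hypothetical.
from numpy import append, dot, exp

def forward_pass(x, weight_matrices_added_biases):
    out = x  # x arrives with the bias unit already appended.
    for w in weight_matrices_added_biases:
        out = 1.0 / (1.0 + exp(-dot(out, w)))  # Sigmoid layer.
        out = append(out, 1.)  # Re-append the bias unit for the next layer.
    return out[:-1]  # Drop the trailing bias unit.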
def rsm_learn(self, epochs):
    '''
    Learning method for the replicated softmax. A higher number of epochs
    results in more training.

    Parameters
    ----------
    epochs: The number of epochs.
    '''
    # Assumes numpy (array, dot, exp, outer, sum), numpy.random as rand,
    # and a module-level sigmoid (see the helper sketch after this method).
    for epoch in range(epochs):
        errsum = 0
        batch_index = 0
        for _ in self.batches:
            # Positive phase - generate data from visible to hidden units.
            pos_vis = data_processing.get_bag_of_words_matrix(self.batches[batch_index])
            D = sum(pos_vis, axis=1)  # Total word count of each document.
            batch_size = len(pos_vis)
            pos_hid_prob = sigmoid(dot(pos_vis, self.weights) + outer(D, self.hidden_biases))
            # A hidden state is 1 if its probability exceeds a uniform random draw.
            randoms = rand.rand(batch_size, self.num_hid)
            pos_hid = array(randoms < pos_hid_prob, dtype=int)
            # Negative phase - generate data from hidden to visible units and
            # then back to hidden units.
            neg_vis = dot(pos_hid, self.weights.T) + self.visible_biases
            tmp = exp(neg_vis)
            s = tmp.sum(axis=1).reshape((batch_size, 1))
            neg_vis_pdf = tmp / s  # Softmax over the vocabulary for each document.
            neg_vis *= 0
            for i in xrange(batch_size):
                # Reconstruct each document by drawing D[i] words from its softmax.
                neg_vis[i] = rand.multinomial(D[i], neg_vis_pdf[i], size=1)
            neg_hid_prob = sigmoid(dot(neg_vis, self.weights) + outer(D, self.hidden_biases))
            # Accumulate the squared reconstruction error.
            errsum += sum((pos_vis - neg_vis) ** 2)
            # Contrastive divergence updates with momentum.
            self.delta_weights = self.delta_weights * self.momentum + dot(pos_vis.T, pos_hid_prob) - dot(neg_vis.T, neg_hid_prob)
            self.delta_visible_biases = self.delta_visible_biases * self.momentum + pos_vis.sum(axis=0) - neg_vis.sum(axis=0)
            self.delta_hidden_biases = self.delta_hidden_biases * self.momentum + pos_hid_prob.sum(axis=0) - neg_hid_prob.sum(axis=0)
            self.weights += self.delta_weights * (self.epsilon_weights / batch_size)
            self.visible_biases += self.delta_visible_biases * (self.epsilon_visibleBiases / batch_size)
            self.hidden_biases += self.delta_hidden_biases * (self.epsilon_hiddenBiases / batch_size)
            batch_index += 1
        print 'Epoch ', epoch + 1, ' Error ', errsum / batch_size
        self.__save_rsm__()
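# Helper sketch for the sigmoid used above (an assumption; the project defines
# it elsewhere). The tanh form is algebraically identical, since
# sigmoid(x) = (1 + tanh(x / 2)) / 2.
from numpy import exp

def sigmoid(x):
    # Element-wise logistic function.
    return 1.0 / (1.0 + exp(-x))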
from numpy import append, ones, sum

def error(args):
    """
    Compute the training or testing error on the unfolded network.
    """
    weights, epoch, training, batches, queue = args
    err = 0
    for batch in range(len(batches)):
        x = get_bag_of_words_matrix(batches[batch]) if training else get_bag_of_words_matrix(batches[batch], training=False)
        x = append(x, ones((len(x), 1)), axis=1)  # Append the bias unit.
        xout, _ = generate_output_data(x, weights)
        # Squared error between the input (without the bias unit) and its reconstruction.
        err += sum((x[:, :-1] - xout) ** 2)
    prefix = 'Train' if training else 'Test'
    out = prefix + ' error before epoch[' + str(epoch + 1) + ']: ' + str(err / len(batches))
    queue.put([training, out, err / len(batches)])
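# Usage sketch (an assumption): error takes a single args tuple and reports
# through a queue, which matches being launched in worker processes, e.g. to
# evaluate train and test error concurrently. The driver name report_errors
# is hypothetical; weights and the batch lists are assumed to be in scope.
from multiprocessing import Process, Queue

def report_errors(weights, epoch, train_batches, test_batches):
    queue = Queue()
    jobs = []
    for training, batch_list in [(True, train_batches), (False, test_batches)]:
        p = Process(target=error, args=((weights, epoch, training, batch_list, queue),))
        p.start()
        jobs.append(p)
    results = [queue.get() for _ in jobs]  # One [training, message, err] per worker.
    for p in jobs:
        p.join()
    for _, message, _ in results:
        print message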
from numpy import append, log, ones, sum

def error(args):
    """
    Compute the training or testing error on the unfolded network.
    """
    weights, epoch, training, batches, queue, binary_output = args
    err = 0
    for batch in range(len(batches)):
        x = get_bag_of_words_matrix(batches[batch]) if training else get_bag_of_words_matrix(batches[batch], training=False)
        x = append(x, ones((len(x), 1)), axis=1)  # Append the bias unit.
        xout, _ = generate_output_data(x, weights, binary_output=binary_output)
        x[:, :-1] = get_norm_x(x[:, :-1])  # Normalise the word counts.
        # Cross-entropy between the normalised input and its reconstruction.
        err -= sum(x[:, :-1] * log(xout))
    prefix = 'Train' if training else 'Test'
    out = '%s error before epoch[%i]: %.2f' % (prefix, epoch + 1, err / len(batches))
    queue.put([training, out, err / len(batches)])
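# Sketch of get_norm_x (an assumption based on its use above): the
# cross-entropy only makes sense if each row of word counts is turned into a
# probability distribution over the vocabulary.
from numpy import array, float64

def get_norm_x(x):
    x = array(x, dtype=float64)
    row_sums = x.sum(axis=1).reshape((len(x), 1))
    return x / row_sums  # Each row now sums to 1.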
def generate_input_data_list(training=True):
    """
    Generate a list of all input data.

    @param training: If True, generate the input for the training data;
    otherwise for the test data.
    """
    batches = data_processing.get_batch_list(training=training)
    input_data = []
    for batch in range(len(batches)):
        print 'Batch ', batch + 1, ' of ', len(batches)
        d = data_processing.get_bag_of_words_matrix(batches[batch], training=training)
        d = get_norm_x(d)  # Normalise the word counts.
        input_data += list(d)
    return input_data
from numpy import append
from numpy.random import choice, randint

def compare_real_data_to_reconstructed_data_random():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    batch = choice(batches)  # Make sure to pick a batch at random.
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Choose 10 data points at random.
    data_points = []
    indices = randint(0, len(data), 10)
    for idx in indices:
        data_points.append(data[idx])
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
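# Sketch of visualise_data_points (an assumption; the real plotting helper is
# defined elsewhere in the project). One way to compare the pairs is to
# overlay each real bag-of-words vector with its reconstruction. The name
# plot_real_vs_reconstructed is hypothetical.
import matplotlib.pyplot as plt

def plot_real_vs_reconstructed(data_points, output_data_points):
    fig, axes = plt.subplots(len(data_points), 1, sharex=True)
    for ax, real, recon in zip(axes, data_points, output_data_points):
        ax.plot(real, label='real')
        ax.plot(recon, label='reconstructed')
    axes[0].legend()
    plt.show()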
def __generate_output_for_train_data_par(args):
    # Worker: run one batch of training data through the unfolded network.
    batch, weight_matrices_added_biases, binary_output = args
    d = data_processing.get_bag_of_words_matrix(batch, training=True)
    return list(generate_output_data(d, weight_matrices_added_biases, binary_output=binary_output))
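# Usage sketch (an assumption): the *_par suffix and the single args tuple
# suggest this worker is driven by Pool.map, one batch per task, with the
# per-batch outputs concatenated in batch order. The driver name
# generate_output_for_train_data is hypothetical.
from multiprocessing import Pool

def generate_output_for_train_data(batches, weight_matrices_added_biases, binary_output, processes=4):
    pool = Pool(processes)
    results = pool.map(__generate_output_for_train_data_par,
                       [(b, weight_matrices_added_biases, binary_output) for b in batches])
    pool.close()
    pool.join()
    output = []
    for r in results:
        output += r  # Keep the documents in batch order.
    return output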