Code Example #1
def main():
    data = DataHelper("data/Twitter.csv")
    # data.MinMaxScale()
    rp = Representation(data.getdata())
    # rp.cal_mean_by_class()
    # rp.cal_cosdist_by_eigenvect()
    _ = rp.cal_KLdivergence()
Code Example #2
def run_blstm(
    dim_proj=256,  # LSTM number of hidden units.
    dim_frame=4096,  # feature dimension of image frame in the video
    output_dim=4096,
    v_length=24,  # video length or number of frames
    max_iter=100,  # The maximum number of epochs to run
    l2_decay=0.0001,  # Weight decay for model params.
    lrate=0.0001,  # Learning rate for SGD, Adam
    lamb=0.2,
    optimizer='SGD',  # SGD, Adam available
    saveto='pairwise-blstm_model.npz',  # The best model will be saved there
    dispFreq=2,  # Display to stdout the training progress every N updates
    validFreq=20,  # Compute the validation error after this number of updates.
    saveFreq=2,  # Save the parameters after every saveFreq updates
    batch_size=256,  # The batch size during training.
    valid_batch_size=20,  # The batch size used for validation/test set.
    test_batch_size=1024,
    weights=[
        1. / 3., 1. / 3., 1. / 3.
    ],  # The weights for forward and backward reconstruction and mean-value reconstruction
    pairwise_weight=0.999,
    reload_model=False,  # If reload model from saveto.
    is_train=False,
    test_step=1,
    hiera_step=2,
    train_data_file_path='/mnt/data2/lixiangpeng/dataset/features/FCV/fcv/',
    test_data_file_path='/mnt/data2/lixiangpeng/dataset/features/FCV/fcv/',
    #train_data_file_path = './',
    #test_data_file_path = './',
    train_splits_num=1,
    test_splits_num=1,
    record_path='./records.txt',
    SS_path='/mnt/data2/lixiangpeng/dataset/features/FCV/SimilarityInfo/Sim_K1_10_K2_5_fcv.pkl'
):
    model_options = locals().copy()
    if reload_model:
        print "Reloading model options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)
    print "model options", model_options

    test_data = DataHelper.DataHelper(model_options['v_length'],
                                      model_options['valid_batch_size'],
                                      model_options['dim_frame'],
                                      data_file='./data/fcv_test_demo.h5',
                                      train=False)

    model = build_model(model_options)

    if reload_model:
        model.reload_params(saveto)

    model.compile(model_options)

    if is_train:
        model.train(model_options['train_data_file_path'], test_data,
                    model_options)
    else:
        model.test(test_data, model_options)
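
A minimal usage sketch for run_blstm above; the call is valid against the signature shown, but the data directories are placeholders rather than paths from the original project:

if __name__ == '__main__':
    # Hypothetical: train first, then reload the saved model and evaluate.
    run_blstm(is_train=True,
              train_data_file_path='./data/',
              test_data_file_path='./data/',
              saveto='pairwise-blstm_model.npz')
    run_blstm(is_train=False,
              test_data_file_path='./data/',
              reload_model=True)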
Code Example #3
def main():
    data = DataHelper('data/Twitter.csv')
    data.MinMaxScale()
    data = data.getdata()
    traindata = data[:4]
    # traindata = oversamplingSMOTE(traindata,read=True)
    traindata = np.concatenate(traindata)
    testdata = data[4:]
    # method = 'KNN'           # runs about 40s    acc 96.75%  f-measure 0.65
    # method = 'SVM'           # runs about 30s    acc 95%     f-measure overfit (nan)
    # method = 'Random Forest' # runs about 0.44s  acc 98.28%  f-measure 0.79
    method = 'XGBoost'
    start = time.time()
    clf = train(method, traindata)
    acc = []
    imbalanced = []
    MDDT = [[5, 9], [11, 14], [16, 18], [25, 29], [31, 37]]
    CUSUM = [[13, 16], [23, 24], [25, 26], [28, 30], [35, 38]]
    PH = [[7, 8], [13, 14], [25, 26]]
    for i in range(len(testdata) // 4):
        print(i)
        last_i = 0
        testbatch = np.concatenate(testdata[i * 4:(i + 1) * 4])
        try:
            acc.append(test(clf, testbatch, criteria='acc'))
            imbalanced.append(test(clf, testbatch,
                                   criteria='confusion matrix'))
        except Exception:
            acc.append(0)
            imbalanced.append([0, 0, 0])
        testbatch = np.concatenate(testdata[i * 4:(i + 1) * 4])
        clf.fit(testbatch[:, :-1], testbatch[:, -1])
        for t in MDDT:
            # Use separate names for the drift-window bounds so they do not
            # overwrite the timing variable `start` set before training.
            seg_start = np.floor(t[0] / 4.0) - 1
            seg_end = np.ceil(t[1] / 4.0) - 1
            if seg_start == i:
                try:
                    testbatch = np.concatenate(testdata[t[0] - 4:t[1] - 4])
                    clf.fit(testbatch[:, :-1], testbatch[:, -1])
                except Exception:
                    continue
                break
    df = pd.DataFrame(acc)
    # df.to_csv("MDDT_ACC_XGB.csv",header=None,index=None)
    df = pd.DataFrame(imbalanced)
    # df.to_csv("MDDT_CONFUSION_XGB.csv",header=None,index=None)
    print("time:", time.time() - start)
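
The train()/test() helpers are not included in this excerpt. Judging from the calls above ('acc' returns a scalar, and the fallback for the other criterion appends a 3-element list), test() could look roughly like the following sketch; the [precision, recall, f1] return value is an assumption, not the project's definition:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def test(clf, batch, criteria='acc'):
    # Hypothetical sketch: the last column is the label, the rest are features.
    X, y = batch[:, :-1], batch[:, -1]
    pred = clf.predict(X)
    if criteria == 'acc':
        return accuracy_score(y, pred)
    # Assumed summary for the imbalanced case: precision, recall, f1.
    p, r, f, _ = precision_recall_fscore_support(y, pred, average='binary')
    return [p, r, f]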
Code Example #4
    def parallelize(self):
        try:
            for index, pair in enumerate(self.index_pairs):
                proc = Process(target=DataHelper.DataHelper(self.queue, self.xmls[pair[0] : pair[1] + 1]).run, args=(pair[0], pair[1]))
                self.procs.append(proc)
                proc.start()

            for proc in self.procs:
                proc.join()

        except KeyboardInterrupt:
            # Wait for all processes to terminate
            while True:
                alive_proc = False
                for proc in self.procs:
                    if proc.is_alive():
                        alive_proc = True
                if not alive_proc:
                    break

            print("\nKEYBOARD INTERRUPT !!\n")

            # Save checkpoint ...
            check_message = ''
            pairs = [self.queue.get() for i in range(core_number)]
            pairs.sort()

            for pair in pairs:
                check_message += str(pair[0]) + ' / ' + str(pair[1]) + ' , '
                self.progress += int(pair[2])
            check_message += str(self.progress) + ' / ' + str(self.total_number) + '\n'
            print('{}/{} : {:.2f}% Complete..\n'.format(self.progress, self.total_number, self.progress / self.total_number * 100))

            self.file_checkpoint.write(check_message)
            self.file_checkpoint.close()
            return

        print('Processing of {} xml data was completed and stored in the database !!'.format(self.total_number))
        self.file_checkpoint.close()
Code Example #5
import tensorflow as tf
import numpy as np
from DataHelper import *
from Model import *

data_helper = DataHelper(pos_data_file="rt-polaritydata/rt-polarity.pos",
                         neg_data_file="rt-polaritydata/rt-polarity.neg",
                         training_data_proportion=.9)

with tf.Session() as session:
    model = Model(sent_len=data_helper.max_sent_len,
                  category_num=2,
                  vocab_size=data_helper.vocab_size,
                  emb_dim=300,
                  filter_widths=[3, 4, 5, 6, 7],
                  filter_num=100,
                  l2_reg_lambda=100.)
    model.train(session=session,
                data_helper=data_helper,
                dropout_keep_prob=.5,
                max_norm_constraint=3.,
                batch_size=50,
                epoch_num=50,
                step_num_between_validations=50)
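
DataHelper itself is not shown above; the Model only reads its max_sent_len and vocab_size attributes (train() presumably also pulls batches from it). A compact, hypothetical sketch of how those two values are typically derived from the rt-polarity files, assuming one example per line:

import numpy as np

class SimpleDataHelper(object):
    """Hypothetical stand-in exposing the attributes Model reads above."""

    def __init__(self, pos_data_file, neg_data_file, training_data_proportion=0.9):
        pos = [line.split() for line in open(pos_data_file)]
        neg = [line.split() for line in open(neg_data_file)]
        sentences = pos + neg
        self.max_sent_len = max(len(s) for s in sentences)
        words = sorted({w for s in sentences for w in s})
        self.word_to_id = {w: i + 1 for i, w in enumerate(words)}  # index 0 = padding
        self.vocab_size = len(self.word_to_id) + 1
        self.labels = np.array([[1, 0]] * len(pos) + [[0, 1]] * len(neg))
        split = int(training_data_proportion * len(sentences))
        perm = np.random.permutation(len(sentences))
        self.train_idx, self.test_idx = perm[:split], perm[split:]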
Code Example #6
def test(_encoder, options, uidx):
    print 'loading test data...'
    hashcode_array = Array()
    h_array = Array()
    label_array = Array()
    lines_num = 0
    for i in range(1, options['test_splits_num'] + 1):
        file_name = 'fcv_test_feats.h5'
        labels_name = 'fcv_test_labels.mat'
        print 'loading ', file_name
        test_data = DataHelper.DataHelper(
            options['v_length'],
            options['batch_size'],
            options['dim_frame'],
            data_file=options['test_data_file_path'] + file_name,
            train=True)
        labels = sio.loadmat(options['test_data_file_path'] +
                             labels_name)['labels']
        lines_num += test_data.data_size_
        if i == 1:
            label_array.setmatrcs(labels)
        else:
            label_array.concate_v(labels)
        print 'data_size: ', test_data.data_size_
        print 'batch_size: ', test_data.batch_size_

        batch_num = test_data.data_size_ // options['test_batch_size']
        if test_data.data_size_ % options['test_batch_size'] != 0:
            batch_num += 1

        for batch_idx in range(batch_num):
            print 'batch_idx: ', batch_idx
            time1 = time.time()
            if batch_idx == (batch_num - 1):
                X = test_data.data_[
                    batch_idx *
                    options['test_batch_size']:][:, :options['v_length'], :]
                X = np.row_stack(
                    (X,
                     np.float32(
                         np.zeros(
                             (options['test_batch_size'] - X.shape[0],
                              options['v_length'], options['dim_frame'])))))
            else:
                X = test_data.data_[
                    batch_idx * options['test_batch_size']:(batch_idx + 1) *
                    options['test_batch_size']][:, :options['v_length'], :]
            time2 = time.time()
            print 'fetching data costs: ', time2 - time1
            print 'batch data shape: ', X.shape
            my_H = _encoder(
                X, np.zeros((X.shape[0], options['dim_proj']),
                            dtype=np.float32),
                np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32),
                np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32),
                np.zeros((X.shape[0], options['dim_proj']), dtype=np.float32))
            time3 = time.time()
            print 'forward costs: ', time3 - time2
            print 'my_H: ', my_H.shape
            BinaryCode = sign(my_H)
            if i == 1 and batch_idx == 0:
                hashcode_array.setmatrcs(BinaryCode)
                h_array.setmatrcs(my_H)
            else:
                hashcode_array.concate_v(BinaryCode)
                h_array.concate_v(my_H)

        hashcode_array.setmatrcs(hashcode_array.getmatrics()[:lines_num])
        h_array.setmatrcs(h_array.getmatrics()[:lines_num])
        print 'hashcode shape:', hashcode_array.getmatrics().shape
    #sio.savemat(str(options['dim_proj'])+'_'+'hashcode_' + str(uidx) + '.mat', {'hashcode': hashcode_array.getmatrics()})
    #sio.savemat(str(options['dim_proj'])+'_'+'h_' + str(uidx) + '.mat', {'h': h_array.getmatrics()})

    test_hashcode = hashcode_array.getmatrics()
    print 'test_hashcode: ', test_hashcode.shape

    test_hashcode = np.matrix(test_hashcode)
    time1 = time.time()
    Hamming_distance = 0.5 * (-np.dot(test_hashcode, test_hashcode.transpose())
                              + options['dim_proj'])
    time2 = time.time()
    print 'hamming distance computation costs: ', time2 - time1
    HammingRank = np.argsort(Hamming_distance, axis=0)
    time3 = time.time()
    print 'hamming ranking costs: ', time3 - time2

    labels = label_array.getmatrics()
    print 'labels shape: ', labels.shape
    sim_labels = np.dot(labels, labels.transpose())
    time6 = time.time()
    print 'similarity labels generation costs: ', time6 - time3

    records = open('map.txt', 'w+')
    maps = []
    for i in range(5, 105, 5):
        map_k = tools.mAP(sim_labels, HammingRank, i)
        maps.append(map_k)
        records.write('epoch: ' + str(uidx) + '\ttopK: ' + str(i) + '\tmap: ' +
                      str(map_k) + '\n')
        print 'i: ', i, ' map: ', map_k, '\n'
    time7 = time.time()
    records.close()
    print 'computing processing costs: ', time7 - time6

    return maps
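
Two details in the code above are easy to miss. The Array container is not part of this excerpt; from the way it is called it is just a row-wise accumulator, roughly (a hypothetical sketch):

import numpy as np

class Array(object):
    """Hypothetical sketch of the accumulator used in test() above."""

    def __init__(self):
        self.m_ = None

    def setmatrcs(self, m):      # (re)set the stored matrix
        self.m_ = np.asarray(m)

    def concate_v(self, m):      # append rows below the stored matrix
        self.m_ = np.vstack((self.m_, np.asarray(m)))

    def getmatrics(self):
        return self.m_

The Hamming-distance line also relies on the codes being in {-1, +1}: for d-bit codes x and y, x·y = d - 2·hamming(x, y), so hamming(x, y) = 0.5·(d - x·y), which is exactly the 0.5 * (dim_proj - H·Hᵀ) expression computed above.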
Code Example #7
pos_len = len(pos)
neg_len = len(neg)
length = pos_len + neg_len
labels = [[1, 0]] * pos_len
labels += [[0, 1]] * neg_len
labels = np.array(labels)
data = np.concatenate((pos, neg), axis=0)
# shuffle data
indice = np.random.permutation(length)
data = data[indice]
labels = labels[indice]
split = int(0.9 * length)  # integer index for the 90/10 split
test_data = data[split:]
test_labels = labels[split:]
data = data[:split]
labels = labels[:split]
d = DataHelper(50, data.shape[0], data, word_vec, word_indice, labels)
t_d = DataHelper(50, test_data.shape[0], test_data, word_vec, word_indice,
                 test_labels)
optm = tf.train.AdamOptimizer(1e-3)
train_ops = optm.apply_gradients(optm.compute_gradients(cross_entropy))

sess = tf.Session()
sess.run(tf.initialize_all_variables())


def train_step(x, y):
    feed_dicts = {x_placeholder: x, y_placeholder: y, prob: 0.5}
    loss = sess.run([train_ops], feed_dict=feed_dicts)


with sess.as_default():
Code Example #8
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        
        # Get the dropout operation from the graph by name
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        
        # Tensors we want to evaluate
        prediction_op = graph.get_operation_by_name("output/predictions").outputs[0]
              
        test_dm = DataHelper(test_data)
        
        # Generate a padding DataFrame and prepend padding before the first sentences
        padding = pd.DataFrame(columns = ['sentence'], index = range(FLAGS.history_size1))
        for i in range(FLAGS.history_size1 + FLAGS.history_size2):
            padding['sentence'][i]=np.array([0] * max_sentence_length)
        padding = padding['sentence']
        unshuffled_test_data = test_dm.get_test_contents()
        
        # Collect the predictions here
        all_predictions = []

        for x_test in unshuffled_test_data:   
            x_test = padding.append(x_test, ignore_index=True)  
            for i in range(len(x_test)-FLAGS.history_size1-1):
                x_test_batch = x_test[i:i+FLAGS.history_size1+FLAGS.history_size2+1].values.tolist()
Code Example #9
            summaries, loss, accuracy, precision, recall = sess.run([
                dev_summary_op, model.loss, model.accuracy, dev_precision_op,
                dev_recall_op
            ], feed_dict)

            if validation_step % 1000 == 0:
                time_str = datetime.datetime.now().isoformat()
                print(
                    "dev> {}: step {}, loss {:g}, acc {:g}, pre {:g}, recall {:g}"
                    .format(time_str, validation_step, loss, accuracy,
                            precision, recall))

            if writer:
                writer.add_summary(summaries, validation_step)

        train_dm = DataHelper(whole_train_data)
        dev_dm = DataHelper(whole_dev_data)

        # zero padding for the first sentences that do not have history sentences
        padding = pd.DataFrame(columns=['sentence'],
                               index=range(FLAGS.history_size1))
        for i in range(FLAGS.history_size1 + FLAGS.history_size2):
            padding['sentence'][i] = np.array([0] * max_sentence_length)
        padding = padding['sentence']

        validation_step = 0
        #generate batches
        for epoch_i in range(FLAGS.num_epochs):
            train_data = train_dm.get_contents(shuffle=True)
            for x_batch, y_batch in train_data:
                x_batch = padding.append(x_batch, ignore_index=True)
Code Example #10
                })
            print('Validation-time loss = {}'.format(test_loss))
            if save_model:
                saver.save(sess, save_path)
                print('Saved trained model to disk!')
        # optional plot
        print('Plotting predictions...')
        plt.figure(figsize=(12, 7))
        plt.plot(test_predictions[:, -1, :],
                 color='green',
                 alpha=0.9,
                 label='Predicted Value')
        plt.plot(y_test, color='red', alpha=0.5, label='Ground truth')
        ax = plt.gca()
        ax.grid(color='black', alpha=0.12)
        plt.xlabel('Timestep')
        plt.ylabel('Value')
        plt.title('Predictions vs. Ground truth')
        plt.legend()
        plt.show()
        print('Process complete!')


if __name__ == '__main__':
    helper = DataHelper('../data/PFE.csv')
    tf.reset_default_graph()
    predictor = PricePredictor(learning_rate=0.0001)
    predictor.run(helper,
                  100,
                  save_model=True,
                  save_path='../model/model.ckpt')
Code Example #11
File: test.py  Project: xiholix/learnTensorFlow
labels = np.array(labels)
data = np.concatenate((pos, neg), axis=0)
# shuffle data
indice = np.random.permutation(length)
data = data[indice]
labels = labels[indice]
data_length = len(data)
dev_indice = int(0.1 * data_length)
print dev_indice

train_data = data[dev_indice:]
train_labels = labels[dev_indice:]
dev_data = data[:dev_indice]
dev_labels = labels[:dev_indice]

d = DataHelper(50, len(train_data), train_data, word_vec, word_indice,
               train_labels)
dev_d = DataHelper(50, len(dev_data), dev_data, word_vec, word_indice,
                   dev_labels)

cnn = CNNNet(4, [3, 4, 5], 50, 300, 56, 2)
optm = tf.train.AdamOptimizer(1e-3)
train_ops = optm.apply_gradients(optm.compute_gradients(cnn.loss))
tf.summary.scalar('accuracy', cnn.accuracy)
tf.summary.scalar('loss', cnn.loss)
tf.summary.histogram('full_w', cnn.full_w)
summary = tf.merge_all_summaries()
sess = tf.Session()
summaryWriter = tf.train.SummaryWriter('./log/summary/', sess.graph)

sess.run(tf.initialize_all_variables())
Code Example #12
    def train(self, train_data_path, test_data, options):

        validFreq = options['validFreq']
        saveFreq = options['saveFreq']
        dispFreq = options['dispFreq']
        max_iter = options['max_iter']
        saveto = options['saveto']

        train_loss_his = []
        test_loss_his = []

        start_time = time.time()

        #test_loss_ = self.test_loss(self._test, test_data, options)
        # test_loss_his.append(test_loss_)
        # print 'Valid cost:', test_loss_

        train_loss = 0.
        records_file = open(options['record_path'],'w+')
        file_name = options['train_data_file_path'] + 'fcv_train_feats.h5'
        train_data = DataHelper.DataHelper(options['v_length'], options['batch_size'],
                                           options['dim_frame'],
                                           data_file=file_name, train=True)
        H = np.zeros([train_data.data_size_, options['dim_proj']],dtype=np.float32)

        try:
            for uidx in xrange(1,max_iter+1):
                #get splits of an epoch
                for eidx in xrange(1,options['train_splits_num']+1):
                    #for YFCC
                    #file_name = options['train_data_file_path']+'yfcc_train_feats_'+str(eidx)+'.h5'
                    #for FCV
                    file_name = options['train_data_file_path'] + 'fcv_train_feats.h5'

                    train_data = DataHelper.DataHelper(options['v_length'], options['batch_size'],
                                                       options['dim_frame'],
                                                        data_file= file_name, train=True)
                    print 'loading data:'+file_name
                    #get the batch train data

                    m = train_data.data_size_ // train_data.batch_size_
                    if train_data.data_size_ % train_data.batch_size_ != 0:
                        m += 1
                    print 'm: ', m
                    for i in range(0,m):
                        #if i % 10 ==0:
                            #print i
                        if i == (m-1):
                            x = indexContent(train_data,train_data.idx_[i*options['batch_size']:])
                            idxs = train_data.idx_[i*options['batch_size']:]

                        else:
                            x = indexContent(train_data,train_data.idx_[i*options['batch_size']:(i+1)*options['batch_size']])
                            idxs = train_data.idx_[i*options['batch_size']:(i+1)*options['batch_size']]

                        [H, train_loss, loss_pairwise,reconstruction_loss] = self._train(
                            x,idxs,H,
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32),
                            np.zeros((x.shape[0], options['dim_proj']), dtype=np.float32))
                        if i % 10 == 0:
                            print 'Epoch: ', uidx, '\tPart: ', eidx, '\tBatch: ', i, '\tCost: ', train_loss, '\tpairwise_loss: ', loss_pairwise, '\treconstruction_loss: ', reconstruction_loss
                            records_file.write('Epoch: ' + str(uidx) + '\tPart: ' + str(eidx) + '\tBatch: ' + str(i) + '\tCost: ' + str(train_loss) + '\tpairwise_loss: ' + str(loss_pairwise) + '\treconstruction_loss: ' + str(reconstruction_loss) + '\n')

                if uidx % validFreq == 0:
                    print 'start testing...'
                    maps = evaluation.test(self._encoder,options,uidx)
                if np.isnan(train_loss) or np.isinf(train_loss):
                    print 'bad cost detected: ', train_loss

                if np.mod(uidx, dispFreq) == 0 or uidx == 1:
                    train_loss = train_loss/(x.shape[0]*x.shape[1])
                    train_loss_his.append(train_loss)
                    print 'Step ', uidx,  'Train cost:', train_loss

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',
                    params_to_save = self.get_params_value()
                    updates_value = self.get_updates_value()
                    np.savez(saveto, params=params_to_save, updates_v=updates_value,
                             train_loss_his=train_loss_his)
                    pkl.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Save Done'


        except KeyboardInterrupt:
            print "Training interupted"
            print 'Saving records!'
            records_file.close()

        if saveto:
            print 'Saving...',
            params_to_save = self.get_params_value()
            updates_value = self.get_updates_value()
            np.savez(saveto, params=params_to_save, updates_v=updates_value,
                     train_loss_his=train_loss_his, test_loss_his=test_loss_his)
            pkl.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Save Done'

        end_time = time.time()
        print('Training took %.1fs' % (end_time - start_time))
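
indexContent is not defined in this excerpt. From the way it is called here, and from the explicit slicing done in the test() example earlier, one plausible sketch is a simple gather over the helper's in-memory feature array; this is an assumption, not the project's actual implementation:

import numpy as np

def indexContent(data_helper, idxs):
    # Hypothetical: return the (batch, frames, dim_frame) features for the
    # requested sample indices; clipping to v_length may happen elsewhere.
    return data_helper.data_[np.asarray(idxs)]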