Ejemplo n.º 1
0
def MSE(list1, list2):
    """Return the root-mean-square error between two equal-length sequences.

    NOTE: despite the name, this computes sqrt(mean(squared diff)),
    i.e. RMSE rather than plain MSE (kept for backward compatibility).

    Returns 0 when the lengths differ (legacy sentinel behavior kept for
    existing callers) or when both sequences are empty (avoids the
    ZeroDivisionError the original raised on empty input).
    """
    if len(list1) != len(list2):
        return 0  # length mismatch: original sentinel, preserved
    if not list1:
        return 0  # empty input: nothing to average
    # Generator + built-in sum avoids shadowing the `sum` builtin with a
    # local variable as the original did.
    squared_error = sum((a - b) ** 2 for a, b in zip(list1, list2))
    return math.sqrt(squared_error / len(list1))


if __name__ == '__main__':
    label_trans = 3
    fuse_list, id_list, label_list, len_digit = load_train_data(
        file='../data/train.csv', label_trans=label_trans)
    len_sample = len(fuse_list)
    LOL = cross_validation.KFold(len_sample, n_folds=2, shuffle=False)
    #x_train,x_test,y_train,y_test=train_test_split(fuse_list,label_list,test_size=0.2)

    mse_ = []
    mse_1 = []
    mse_2 = []
    for train_index, test_index in LOL:
        x_train, x_test = fuse_list[train_index], fuse_list[test_index]
        y_train, y_test = label_list[train_index], label_list[test_index]
        xg_train = xgb.DMatrix(x_train, label=y_train)
        xg_test = xgb.DMatrix(x_test, label=y_test)

        watchlist = [(xg_train, 'train'), (xg_test, 'test')]
        num_round = 2000
Ejemplo n.º 2
0
def serialize_data(plain_data_tuple, json_file):
    """Cache prepared training data as JSON.

    Preparing the data from XML is a slow process, so the ready-made
    data is dumped to a JSON file for later reuse.
    """
    tweets, emotions, weight = utils.load_train_data(
        plain_data_tuple['file'], plain_data_tuple['n'])
    payload = {"twits": tweets, "emotions": emotions, "weight": weight}
    make_json(payload, json_file)
Ejemplo n.º 3
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--training-file')
    parser.add_argument('--hr',
                        type=int,
                        default=9,
                        help='hours taken to predict. (default: %(default)s)')
    args = parser.parse_args()

    train_file = args.training_file
    lr = 1e-3
    hr = args.hr

    trainX, trainY, mean, std = utils.load_train_data(train_file,
                                                      hr,
                                                      preprocessing=False)
    trainX, validX, trainY, validY = utils.train_test_split(
        trainX, trainY, 0.1)
    print(f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}\033[0m')

    optimizer = Adam(compute_gradient, lr)

    XTX = trainX.T @ trainX
    XTY = trainX.T @ trainY
    epochs = 40000
    w = np.random.normal(0, 0.05, size=(trainX.shape[1], 1)).astype(np.float32)
    for epoch in range(epochs):
        optimizer.update(XTX, XTY, w)
        if epoch % 100 == 0:
            print(
Ejemplo n.º 4
0
import tflearn
import numpy as np

from utils import load_train_data
from model_builder import vgg16_2

import logging
# Root logger at DEBUG so the shape/progress messages below are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# VGG 16
# Run identifier used in checkpoint filenames; NUM_CLASS is the number
# of output classes of the classifier head.
MODEL_NAME = "VGG16_basic_128x128"
NUM_CLASS = 2

# Load Dataset
# NOTE(review): `name` looks like a filename template the loader fills in
# with the resize dimensions -- confirm against utils.load_train_data.
train_data = load_train_data(name="train_data_{}x{}.npy",
                             resize_pics=(128, 128))

# Dataset reshaping
# Column 0 holds images, column 1 holds labels; unpack the object arrays
# into regular stacked numpy arrays.
X, Y = train_data[:, 0], train_data[:, 1]
X = np.array([i for i in X])
Y = np.array([i for i in Y])
logging.debug("{}, {}".format(X.shape, Y.shape))

# Training
model = tflearn.DNN(vgg16_2(NUM_CLASS),
                    checkpoint_path='checkpoint/model_{}'.format(MODEL_NAME),
                    max_checkpoints=1,
                    tensorboard_verbose=0,
                    tensorboard_dir="log")
logging.debug("Model building finish")
Ejemplo n.º 5
0
    def run_model(self):
        # os.environ["CUDA_VISIBLE_DEVICES"] = "2"
        # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6) # allocate certain gpu memory
        # config = tf.ConfigProto(gpu_options=gpu_options)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        total_batch = int(self.num_users / self.batch_size) + 1
        u_mean, u_cov, i_mean, i_cov = self.build_model()
        init = tf.global_variables_initializer()
        with tf.Session(config=config) as sess:
            sess.run(init)  # initialize all the variables
            # load weights
            epoch_pre = 999999999
            for epoch in range(self.num_epoch):
                epoch_cost = 0
                for itr in range(total_batch):
                    train_tuple = load_train_data(self.train_content,
                                                  self.train_path,
                                                  self.num_users,
                                                  self.num_items,
                                                  self.batch_size)
                    for jj in range(self.batch_size):
                        cost, _ = sess.run(
                            [self.loss, self.train_op],
                            feed_dict={
                                self.user_idx:
                                train_tuple[jj, 0],
                                self.item_pos_idx:
                                train_tuple[jj, 1],
                                self.item_neg_idx:
                                train_tuple[jj, 2],
                                self.user_attention_items_idx:
                                self.train_set[train_tuple[jj, 0]]
                                # self.content_pos: self.content_data[train_tuple[:, 1]],
                                # self.content_neg: self.content_data[train_tuple[:, 2]]
                            })

                        epoch_cost = epoch_cost + cost
                '''
				for u in range(self.num_users):
					time1 = time.time()
					for i in range(self.num_items):
						self.R[u, i] = self.cal_was_distance(self.user_mean_distribution[u], self.item_mean_distribution[i], 
													self.user_cov_distribution[u], self.item_cov_distribution[i])
					time2 = time.time()
					break
				'''

                # time1 = time.time()
                # self.R[0, 0] = self.cal_was_distance(self.user_mean_distribution[0], self.item_mean_distribution[0],
                # 									self.user_cov_distribution[0], self.item_cov_distribution[0])
                # time2 = time.time()
                print('##########################')
                print('Epoch ' + str(epoch + 1))
                print('##############')
                print('The total cost is: ' + str(epoch_cost))
                # print('It took %.3f seconds to train an epoch'%(time2-time1))
                print('\n')

                # if epoch_pre > epoch_cost:
                # 	epoch_pre = epoch_cost
                # else:
                # 	break
            '''Testing'''
            '''Fix some bugs, test_set'''
            item_test_set = list(self.test_set.keys())
            num_item_test = len(item_test_set)
            HR = np.array([0.0] * 5, dtype=np.float)
            NDCG = np.array([0.0] * 5, dtype=np.float)
            for user in range(self.num_users):
                pos_item_test = self.test_set[user]
                negative_item_set = list(
                    set(range(self.num_items)) - set(self.train_set[user]))
                if user in self.valid_set:
                    negative_item_set = list(
                        set(negative_item_set) - set(self.valid_set[user]))
                if user in self.test_set:
                    negative_item_set = list(
                        set(negative_item_set) - set(self.test_set[user]))
                neg_item_test = np.random.choice(negative_item_set,
                                                 99,
                                                 replace=False)
                # item_to_test = np.append(pos_item_test, neg_item_test)
                item_to_test = np.append(neg_item_test, pos_item_test)
                users = [user] * 100
                ratings = []
                for kk in range(100):
                    value = sess.run(self.distances,
                                     feed_dict={
                                         self.user_idx:
                                         users[kk],
                                         self.item_pos_idx:
                                         item_to_test[kk],
                                         self.user_attention_items_idx:
                                         self.train_set[users[kk]]
                                     })
                    ratings.append(value)
                item_score = [(item_to_test[i], ratings[i])
                              for i in range(len(item_to_test))]

                item_score = sorted(item_score, key=lambda x: x[1])

                item_sort = [pair[0] for pair in item_score]
                '''ouput user 0's ratings for checking '''
                # if user == 0:
                # 	rating_sort = [pair[1] for pair in item_score]
                # 	with open('CHECKING_cold_attention.txt', 'a') as f:
                # 		for rating in  rating_sort:
                # 			f.write(str(rating)+'\n')

                r = []

                for i in item_sort:
                    if i in pos_item_test:
                        r.append(1)
                    else:
                        r.append(0)

                hr_1 = self.hr_at_k(r, 1)
                hr_3 = self.hr_at_k(r, 3)
                hr_5 = self.hr_at_k(r, 5)
                hr_7 = self.hr_at_k(r, 7)
                hr_10 = self.hr_at_k(r, 10)
                ndcg_1 = self.ndcg_at_k(r, 1)
                ndcg_3 = self.ndcg_at_k(r, 3)
                ndcg_5 = self.ndcg_at_k(r, 5)
                ndcg_7 = self.ndcg_at_k(r, 7)
                ndcg_10 = self.ndcg_at_k(r, 10)
                HR = HR + np.array([hr_1, hr_3, hr_5, hr_7, hr_10],
                                   dtype=np.float)
                NDCG = NDCG + np.array(
                    [ndcg_1, ndcg_3, ndcg_5, ndcg_7, ndcg_10], dtype=np.float)

            HR = HR / num_item_test
            NDCG = NDCG / num_item_test

            print('HR: ' + str(HR))
            print('NDCG: ' + str(NDCG))
            # print(type(HR[0]))

            with open(
                    'results/100_' + self.dataset_name +
                    '_Record_HR_cold_%s_attention.txt' % (self.sparsity),
                    'a') as f:
                f.write(
                    'Testing cold Result(dim=%d)(%d epochs): hr@1:%f  hr@3:%f  hr@5:%f  hr@7:%f  hr@10:%f\n'
                    % (self.n_z,
                       (epoch + 1), HR[0], HR[1], HR[2], HR[3], HR[4]))
                # f.write(str(HR)+'\n')
            with open(
                    'results/100_' + self.dataset_name +
                    '_Record_NDCG_cold_%s_attention.txt' % (self.sparsity),
                    'a') as f:
                f.write(
                    'Testing cold Result(dim=%d)(%d epochs): ndcg@1:%f ndcg@3:%f ndcg@5:%f ndcg@7:%f ndcg@10:%f\n'
                    %
                    (self.n_z,
                     (epoch + 1), NDCG[0], NDCG[1], NDCG[2], NDCG[3], NDCG[4]))
            '''
Ejemplo n.º 6
0
def save_random_paragraphs():
    """Write the first 100 training samples to 'example_test.txt',
    each followed by the module-level PARAGRAPH delimiter.
    """
    X, y = load_train_data()  # y is unused but kept from the loader's API
    # BUG FIX: the original opened the file and never closed it; the
    # context manager guarantees flush/close even if a write fails.
    with open('example_test.txt', 'wb') as output:
        for x in X[:100]:
            output.write(x)
            output.write(PARAGRAPH)
Ejemplo n.º 7
0
    parser.add_argument('-g', '--gpu', default='5')
    args = parser.parse_args()

    data_dir = args.data_dir
    model_path = args.model_path
    output_dir = args.output_dir
    function = args.model_function
    input_shape = (128, 128)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    trainX, trainY = utils.load_train_data(data_dir, input_shape, normalize=False, preprocessing=True)
    print(f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}\033[0m')

    if function not in globals():
        globals()[function] = getattr(importlib.import_module(function[:function.rfind('.')]), function.split('.')[-1])
    model = globals()[function](input_shape + (3,), 11)
    model.compile(Adam(1e-3), loss='categorical_crossentropy', metrics=['acc'])
    model.summary()

    print('\033[32;1mLoading Model\033[0m')
    model.load_weights(model_path)

    os.makedirs(output_dir, exist_ok=True)
    os.chdir(output_dir)
    idx = [83, 4218, 4707, 8598]
    images, labels = trainX[idx], trainY[idx]
Ejemplo n.º 8
0
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparatopn
# ==================================================

# Load data
print("Loading data...")
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")

# Training
# ==================================================

prev_auc = 0
with tf.Graph().as_default():
  with tf.device("/gpu:1"):
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = InsQACNN(
from keras.preprocessing.sequence import pad_sequences

from network import create_model, find_checkpoint_file, run_test
from utils import preprocess_data, load_train_data, load_from_file
from config import short_conf as conf

# Loading input sequences, output sequences
print('Loading data...')
# load_train_data returns a 1-tuple here; the trailing comma unpacks it.
x, = load_train_data(
    file_source_lang='small_vocab_en',
    file_target_lang='small_vocab_fr',
    load_method=load_from_file,
)

# create mappings
x_sent_array, x_vocab_len, x_word_to_idx, x_idx_to_word = preprocess_data(
    x, conf['MAX_LEN'], conf['VOCAB_SIZE'], num_seq=5000)
# NOTE(review): the target-side mapping is also built from `x` (the source
# sentences), not from the French side -- confirm this is intentional.
y_sent_array, y_vocab_len, y_word_to_idx, y_idx_to_word = preprocess_data(
    x, conf['MAX_LEN'], conf['VOCAB_SIZE'], num_seq=5000)

# Find the length of the longest sequence
x_max_len = max([len(sentence) for sentence in x_sent_array])
y_max_len = max([len(sentence) for sentence in y_sent_array])

# Padding zeros to make all sequences have a same length with the longest one
print('Zero padding...')
X = pad_sequences(x_sent_array, maxlen=x_max_len, dtype='int32')
y = pad_sequences(y_sent_array, maxlen=y_max_len, dtype='int32')

# Creating the network model
print('Compiling model...')
Ejemplo n.º 10
0
RUN = 'run_2'
TRAIN_DIR = '/media/data_raid/nikhil/encoder_summary/'

tf.app.flags.DEFINE_string(
    'train_dir', TRAIN_DIR + RUN,
    """Directory where we write logs and checkpoints""")
tf.app.flags.DEFINE_string('checkpoint_dir', TRAIN_DIR + RUN,
                           """Directory from where to read the checkpoint""")

# Transforming-autoencoder architecture hyper-parameters.
NUMBER_OF_CAPSULES = 60
RECOGNISE_DIMEN = 10
GENERATE_DIMEN = 20
IN_DIMEN = 28 * 28  # flattened 28x28 input (MNIST-sized, presumably)

if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='Transforming Autoencoder Tensorflow implementation')
    #parser.add_argument('-chk', '--resume_checkpoint')
    args = parser.parse_args()
    # BUG FIX: `print args` is Python 2 statement syntax and a
    # SyntaxError under Python 3; the call form works in both.
    print(args)

    train_images = load_train_data()
    X_trans, trans, X_original = translate(train_images)

    model = Model_Train(X_trans, trans, X_original, NUMBER_OF_CAPSULES,
                        RECOGNISE_DIMEN, GENERATE_DIMEN, IN_DIMEN)
    model.train()

    sys.exit(0)
Ejemplo n.º 11
0
        return np.array(embedding, np.float32)

    def get_word2idx(self):
        """Build a word-to-index mapping from the model vocabulary,
        appending '<PAD>' and '<UNK>' as the last two indices.
        """
        mapping = {}
        for index, word in enumerate(self.model.wv.vocab):
            mapping[word] = index
        mapping['<PAD>'] = len(mapping)
        mapping['<UNK>'] = len(mapping)
        return mapping


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('model_path')
    parser.add_argument('train_data_with_label')
    parser.add_argument('train_data_without_label')
    parser.add_argument('test_data')
    args = parser.parse_args()

    trainX, _ = utils.load_train_data(args.train_data_with_label,
                                      preprocessing=False)
    trainX_no_label = utils.load_train_data(args.train_data_without_label,
                                            label=False,
                                            preprocessing=False)

    print("loading testing data ...")
    testX = utils.load_test_data(args.test_data, preprocessing=False)

    model = Word2Vec(300).fit(trainX + trainX_no_label + testX)

    print("saving model ...")
    model.save(args.model_path)
Ejemplo n.º 12
0
                        '--training-file',
                        nargs=2,
                        help='trainX and trainY')
    parser.add_argument('-T', '--no-training', action='store_true')
    parser.add_argument('-s',
                        '--test',
                        nargs=2,
                        help='testing file and the predicted file')
    args = parser.parse_args()

    model_path = args.model_path
    train_file = args.training_file
    training = not args.no_training
    test = args.test

    trainX, trainY, mean, std = utils.load_train_data(train_file[0],
                                                      train_file[1])
    trainY = (trainY * 2 - 1).astype(np.int32).ravel()
    trainX, validX, trainY, validY = utils.train_test_split(trainX, trainY)
    print(
        f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}, validX: {validX.shape}, validY: {validY.shape}\033[0m'
    )

    if training:
        T = 32
        clf = RandomForest(T).fit(trainX, trainY, max_height=9)
        utils.save_model(model_path, clf)
    else:
        clf = utils.load_model(model_path)

    if test:
        testX = utils.load_test_data(test[0], mean, std)
Ejemplo n.º 13
0
import sys, os, random
from MulticoreTSNE import MulticoreTSNE as TSNE
import matplotlib.pyplot as plt
import pickle, os, sys

import utils


def TSNE_and_draw(vec):
    """Project high-dimensional vectors down to 2-D with multicore t-SNE."""
    reducer = TSNE(n_jobs=os.cpu_count(), perplexity=100)
    return reducer.fit_transform(vec)


def plot_result(embedding, Y, file_path):
    """Scatter-plot a 2-D embedding, one color per class label in Y,
    and save the figure to file_path.
    """
    # BUG FIX: the original iterated np.unique(Y), but `numpy` is never
    # imported in this module, so calling it raised NameError.
    # sorted(set(Y)) yields the same sorted unique labels for 1-D data.
    for c in sorted(set(Y)):
        mask = Y == c  # boolean mask; assumes Y is a numpy array of labels
        plt.scatter(embedding[mask, 0],
                    embedding[mask, 1],
                    s=0.5,  # tiny markers: embeddings have many points
                    label=str(c))
    plt.legend(fancybox=True)
    plt.savefig(file_path)


if __name__ == '__main__':
    # argv[1]: path to the training data file (features X, labels Y).
    X, Y = utils.load_train_data(sys.argv[1])

    embedding = TSNE_and_draw(X)  # place accurate in the beginning
    print('finishing TSNE')
    plot_result(embedding, Y, 'tsne.jpg')
Ejemplo n.º 14
0
# Save a checkpoint each epoch; the filename embeds the epoch number and
# the conv6_cls head's categorical accuracy.
checkpoint = ModelCheckpoint(
    'output/icnet_' + model_type +
    '_{epoch:03d}_{conv6_cls_categorical_accuracy:.3f}.h5',
    mode='max')
tensorboard = TensorBoard(
    batch_size=batch_size,
    log_dir="./logs/ICNet/" + model_type +
    "/{}/".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())))
# Polynomial learning-rate decay: base LR 0.01, power 0.9, over `epochs`.
lr_decay = LearningRateScheduler(PolyDecay(0.01, 0.9, epochs).scheduler)

# ==========
# Generators
# ==========
# Early fusion feeds depth+color data through a single generator.
# NOTE(review): n_classes=34 suggests the Cityscapes full label set --
# confirm against the dataset configuration.
if model_type == "early_fusion":
    train_generator = utils.early_fusion_generator(
        df=utils.load_train_data(configs.label_depth_color_path),
        batch_size=batch_size,
        resize_shape=(configs.img_width, configs.img_height),
        crop_shape=(configs.img_width, configs.img_height),
        n_classes=34,
        training=True)

    val_generator = utils.early_fusion_generator(
        df=utils.load_val_data(configs.val_depth_color_path),
        batch_size=1,
        resize_shape=(configs.img_width, configs.img_height),
        crop_shape=(configs.img_width, configs.img_height),
        n_classes=34,
        training=False)
elif "cross_fusion" in model_type:
    train_generator = utils.fusion_generator(
Ejemplo n.º 15
0
model_type = "large_full"

#### Train ####

# Callbacks
checkpoint = ModelCheckpoint('output/icnet_' + model_type +
                             '_{epoch:03d}_{categorical_accuracy:.3f}.h5',
                             mode='max')
tensorboard = TensorBoard(
    batch_size=batch_size,
    log_dir="./logs/ICNet/" + model_type +
    "/{}/".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())))
lr_decay = LearningRateScheduler(PolyDecay(0.01, 0.9, epochs).scheduler)

# Generators
train_generator = utils.generator(df=utils.load_train_data(),
                                  batch_size=batch_size,
                                  resize_shape=(configs.img_width,
                                                configs.img_height),
                                  crop_shape=(configs.img_width,
                                              configs.img_height),
                                  n_classes=34,
                                  training=True)

val_generator = utils.generator(df=utils.load_val_data(configs.val_label_path),
                                batch_size=1,
                                resize_shape=(configs.img_width,
                                              configs.img_height),
                                crop_shape=(configs.img_width,
                                            configs.img_height),
                                n_classes=34,
Ejemplo n.º 16
0
import numpy as np
import os
from utils import check_DNA, load_train_data, load_test_data, get_gram, kaggle_submit
from kernels import GaussianKernel, SpectrumKernel, MismatchKernel
from preprocessing import center_gram, scale_gram
from grid_search import CvSearch_Spectrum, CvSearch_Sum, get_best_sum
from sum_kernels import compute_gram
from KRR import KRR
from KSVM import KernelSVM

# loading datasets
data_folder = 'data/'
experiment_path = './submissions'

# load string data
# 'raw' keeps the DNA sequences as strings (no numeric featurization).
X_train, Y_train = load_train_data(data_folder, 'raw')
X_test = load_test_data(data_folder, 'raw')

# map the target vectors to do binary classification
# Maps {0, 1} labels to {-1, +1} for the margin-based models below.
Y_train_ = 2 * Y_train - 1

##### DATASET 0
# Sum of mismatch kernels with k-mer lengths 7 and 8; lambd_0 is
# presumably the regularization strength for dataset 0 -- confirm usage.
kernels = [['mismatch', ii] for ii in [7, 8]]
lambd_0 = 1.4
K_train_0, K_test_0 = get_gram(kernels=kernels,
                               dataset_idx=0,
                               X_train=X_train[0],
                               X_test=X_test[0],
                               scale=False,
                               center=False,
                               sum_gram=True)
Ejemplo n.º 17
0
    parser.add_argument('model_path')
    parser.add_argument('-t', '--training-file')
    parser.add_argument('-T', '--no-training', action='store_true')
    parser.add_argument('-s',
                        '--test',
                        nargs=2,
                        help='testing file and the predicted file')
    args = parser.parse_args()

    model_path = args.model_path
    train_file = args.training_file
    training = not args.no_training
    test = args.test

    if training:
        trainX, trainY, mean, std = utils.load_train_data(train_file, 9)
        trainX, trainY = np.matrix(trainX), np.matrix(trainY)
        print(
            f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}\033[0m')
        np.save('mean_best.npy', mean)
        np.save('std_best.npy', std)

        w = (trainX.T * trainX).I * (trainX.T * trainY)
        #a = np.array(w)[1:].reshape(-1, 9)
        #for i in a:
        #    print(('%.3f '*9) % tuple(i))
        #print(w.shape)
        np.save(model_path, w)
    else:
        w = np.load(model_path)
        mean, std = np.load('mean_best.npy'), np.load('std_best.npy')
Ejemplo n.º 18
0
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparatopn
# ==================================================

# Load data
print("Loading data...")
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")

# Training
# ==================================================

prev_auc = 0
with tf.Graph().as_default():
    with tf.device("/gpu:1"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = InsQACNN(_margin=FLAGS.margin,
Ejemplo n.º 19
0
import numpy as np

from utils import load_train_data
from model_builder import vgg16_2

import logging
# Root logger at DEBUG so the shape/progress messages below are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# VGG 16
MODEL_NAME = "VGG16_basic_128x128"
NUM_CLASS = 2

# Load Dataset
# NOTE(review): `name` looks like a filename template the loader fills in
# with the resize dimensions -- confirm against utils.load_train_data.
train_data = load_train_data(
    name="train_data_{}x{}.npy",
    resize_pics=(128, 128))

# Dataset reshaping
# Column 0 holds images, column 1 holds labels; unpack into stacked arrays.
X, Y = train_data[:, 0], train_data[:, 1]
X = np.array([i for i in X])
Y = np.array([i for i in Y])
logging.debug("{}, {}".format(X.shape, Y.shape))

# Training
# NOTE(review): `tflearn` is referenced but never imported in this
# snippet -- this line raises NameError unless it is imported elsewhere.
model = tflearn.DNN(
    vgg16_2(NUM_CLASS),
    checkpoint_path='checkpoint/model_{}'.format(MODEL_NAME),
    max_checkpoints=1,
    tensorboard_verbose=0,
    tensorboard_dir="log")
Ejemplo n.º 20
0
    parser.add_argument('--hidden_ch', default=6, type=int)
    parser.add_argument('--dropout', default=False, action='store_true')
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--gamma', default=0.9, type=float)
    parser.add_argument('--max_epoch', default=300, type=int)
    parser.add_argument('--normalize', default=False, action='store_true')
    parser.add_argument('--alldata', default=False, action='store_true')
    parser.add_argument('--small', default=False, action='store_true')
    parser.add_argument('--fix_lr', default=False, action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()

    train_ids, train_data, train_labels, med = load_train_data(
        'data/train.csv', small=args.small)
    print(train_data.shape)

    if args.normalize:
        train_mean = np.mean(train_data, axis=0)
        train_std = np.std(train_data, axis=0)
        train_data = (train_data - train_mean) / train_std

    if not args.alldata:
        val_data = train_data[-100:]
        val_labels = train_labels[-100:]
        train_data = train_data[:-100]
        train_labels = train_labels[:-100]

    model = TitanicModel(hidden_ch=args.hidden_ch, is_dropout=args.dropout)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=args.lr),
Ejemplo n.º 21
0
                val_acc += acc

            val_result = f'{self.mode} {epoch}, loss:{val_loss / (j + 1)}, acc:{val_acc / (j + 1)}\n'
            print(val_result)
            self.save_result(file, val_result)
            val_loss_lst.append(val_loss / (j + 1))
            val_acc_lst.append(val_acc / (j + 1))

        return loss_lst, acc_lst, val_loss_lst, val_acc_lst


if __name__ == '__main__':
    import utils
    from model import Linear

    meta_file = 'batches.meta'

    label_names = utils.get_label_names(meta_file)
    # Load the training batches; 200 is presumably a sample-count or
    # batch-size argument -- confirm against utils.load_train_data.
    train_images, train_labels = utils.load_train_data(200)

    # Hold out the last batch for validation and reshape it to carry an
    # explicit leading "number of batches" dimension of 1.
    val_images, val_labels = train_images[-1], train_labels[-1]
    val_images = val_images.reshape(1, val_images.shape[0],
                                    val_images.shape[1], val_images.shape[2],
                                    -1)
    val_labels = val_labels.reshape(1, val_labels.shape[0], -1)
    train_images, train_labels = train_images[:-1], train_labels[:-1]

    # 3072 = 32*32*3 flattened input, 10 classes; the third argument's
    # meaning depends on LinearRS's signature -- confirm.
    linear_rs = LinearRS(3072, 10, 10)

    linear_rs.train(train_images, train_labels, val_images, val_labels)
Ejemplo n.º 22
0
def train_cnn(args):
  """Train a VGG-style CNN on a grayscale image dataset and log to wandb.

  Loads train/test splits from args.data_home, scales pixels to [0, 1]
  (optionally subtracting the training-set pixel mean), augments the
  training images, builds a BLOCKS x CONV_PER_BLOCK convolutional network,
  trains it with early stopping, and prints final train/test metrics.
  The dataset appears to be KMNIST (judging by KmnistCallback/LABELS_10).

  Args:
      args: namespace with a `data_home` attribute (path to the dataset).
  """
  # initialize wandb logging to your project
  wandb.init()
  # Snapshot the module-level hyper-parameters into the wandb run config.
  config = {
    "model_type" : "cnn",
    "batch_size" : BATCH_SIZE,
    "num_classes" : NUM_CLASSES,
    "epochs" : EPOCHS,
    "filters_init": FILTERS_INIT,
    "dropout": DROPOUT,
    "blocks": BLOCKS,
    "conv_per_block": CONV_PER_BLOCK,
    "fc_size": FC_SIZE,
    "learning_rate": LEARNING_RATE,
    "subtract_pixel_mean": SUBTRACT_PIXEL_MEAN
  }
  wandb.config.update(config)

  pprint(config)

  # Load the data from the relative path provided
  x_train, y_train = load_train_data(args.data_home)
  x_test, y_test = load_test_data(args.data_home)

  # reshape to channels last (single-channel images)
  x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
  x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
  input_shape = (img_rows, img_cols, 1)

  # Scale pixel values to [0, 1].
  x_train = x_train.astype('float32')
  x_test = x_test.astype('float32')
  x_train /= 255
  x_test /= 255

  # If subtract pixel mean is enabled; note the *training* mean is
  # subtracted from both splits, as it must be.
  if SUBTRACT_PIXEL_MEAN:
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean

  # Data augmentation: small random rotations and shifts.
  datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1)
  datagen.fit(x_train)

  N_TRAIN = len(x_train)
  N_TEST = len(x_test)
  wandb.config.update({"n_train" : N_TRAIN, "n_test" : N_TEST})
  print('{} train samples, {} test samples'.format(N_TRAIN, N_TEST))

  # Convert class vectors to binary class matrices
  y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)
  y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES)

  # Build model (inspired from https://keras.io/examples/cifar10_resnet/)
  # Each block is CONV_PER_BLOCK x (Conv-BN-ReLU-Dropout); between blocks
  # the filter count doubles and the spatial size halves.
  model = tf.keras.Sequential()
  filters = FILTERS_INIT
  model.add(layers.Conv2D(filters, kernel_size=(3, 3),
            input_shape=input_shape, padding="same"))
  model.add(layers.BatchNormalization())
  model.add(layers.Activation('relu'))
  model.add(layers.Dropout(DROPOUT / 2))
  for _ in range(CONV_PER_BLOCK - 1):
    model.add(layers.Conv2D(filters, kernel_size=(3, 3), padding="same"))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))
    model.add(layers.Dropout(DROPOUT / 2))
  for _ in range(BLOCKS - 1):
    filters *= 2
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    for _ in range(CONV_PER_BLOCK):
      model.add(layers.Conv2D(filters, kernel_size=(3, 3), padding="same"))
      model.add(layers.BatchNormalization())
      model.add(layers.Activation('relu'))
      model.add(layers.Dropout(DROPOUT / 2))
  model.add(layers.GlobalAveragePooling2D())
  model.add(layers.Dropout(DROPOUT))
  model.add(layers.Dense(NUM_CLASSES, activation='softmax'))

  # Fix: use tf.keras.optimizers.Adam with the `learning_rate` kwarg, as in
  # every other optimizer construction in this file; the bare `keras` name
  # used before is not referenced anywhere else visible here.
  model.compile(loss="categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              metrics=['accuracy'])

  model.summary()

  # NOTE(review): fit_generator is deprecated in TF2 in favour of
  # model.fit(datagen.flow(...)); kept as-is to avoid changing API
  # version assumptions.
  model.fit_generator(datagen.flow(x_train, y_train, batch_size=BATCH_SIZE),
                      epochs=EPOCHS,
                      verbose=1,
                      validation_data=(x_test, y_test),
                      callbacks=[KmnistCallback(), EarlyStopping(patience=5), ReduceLROnPlateau(),
                                 WandbCallback(data_type="image", labels=LABELS_10)])

  # Final evaluation on both splits (train is evaluated un-augmented).
  train_score = model.evaluate(x_train, y_train, verbose=0)
  test_score = model.evaluate(x_test, y_test, verbose=0)
  print('Train loss:', train_score[0])
  print('Train accuracy:', train_score[1])
  print('Test loss:', test_score[0])
  print('Test accuracy:', test_score[1])
Ejemplo n.º 23
0
    def train(self, args):
        """Train SG-GAN.

        Iterates `args.epoch` times over the frames in
        ./datasets/<args.dataset_dir>/trainA, optimising generator and
        discriminator with Adam, logging per-epoch image/loss summaries,
        and saving a checkpoint when training completes or is interrupted.

        Args:
            args: parsed CLI namespace; fields read here: beta1,
                continue_train, checkpoint_dir, epoch, dataset_dir,
                train_size, batch_size, image_width, image_height,
                segment_class, use_augmentation, input_nc,
                sample_dir/sample_freq/save_freq (via helpers).
        """

        # NOTE(review): learning rate is hard-coded; the args.lr decay
        # schedule is commented out below -- confirm before tuning.
        lr = 0.001  # self.lr = 0.001
        self.d_optim = tf.keras.optimizers.Adam(learning_rate=lr,
                                                beta_1=args.beta1)
        self.g_optim = tf.keras.optimizers.Adam(learning_rate=lr,
                                                beta_1=args.beta1)
        start_time = time.time()

        if args.continue_train:
            print(" [*] Loading pretrained weights ...")
            if self.load(args.checkpoint_dir):
                print(" [*] Load SUCCESS")
            else:
                print(" [!] Load failed...")
        else:
            print(" [*] New training STARTED")

        # Fix: pre-bind `epoch` so the `finally` save below cannot raise
        # NameError when args.epoch == 0 or the loop fails before starting.
        epoch = 0
        try:
            for epoch in range(args.epoch):
                # Re-list and reshuffle the training files every epoch.
                dataA = glob(
                    './datasets/{}/*.*'.format(args.dataset_dir + '/trainA')
                )  # glob('./datasets/{}/*.*'.format(self.dataset_dir + '/trainA'))
                np.random.shuffle(dataA)
                batch_idxs = min(
                    len(dataA),
                    args.train_size) // args.batch_size  # self.batch_size
                # lr = args.lr if epoch < args.epoch_step else args.lr*(args.epoch-epoch)/(args.epoch-args.epoch_step)

                augmenter = DataAugmentation()

                for idx in range(0, batch_idxs):
                    batch_files = list(
                        zip(dataA[idx * args.batch_size:(idx + 1) *
                                  args.batch_size]))

                    batch_images = []
                    batch_segs = []
                    batch_seg_mask_A = []

                    for batch_file in batch_files:
                        # Un-augmented sample.
                        tmp_image, tmp_seg, tmp_seg_mask_A = load_train_data(
                            batch_file,
                            args.image_width,
                            args.image_height,
                            num_seg_masks=args.segment_class,
                            do_augment=False,
                            augmenter=augmenter
                        )  # num_seg_masks=self.segment_class)
                        batch_images.append(tmp_image)
                        batch_segs.append(tmp_seg)
                        batch_seg_mask_A.append(tmp_seg_mask_A)

                        # Optionally add an augmented copy of the same file,
                        # doubling the effective batch size.
                        if (args.use_augmentation):
                            tmp_image, tmp_seg, tmp_seg_mask_A = load_train_data(
                                batch_file,
                                args.image_width,
                                args.image_height,
                                num_seg_masks=args.segment_class,
                                do_augment=True,
                                augmenter=augmenter
                            )  # num_seg_masks=self.segment_class)
                            batch_images.append(tmp_image)
                            batch_segs.append(tmp_seg)
                            batch_seg_mask_A.append(tmp_seg_mask_A)

                    batch_images = np.array(batch_images).astype(np.float32)
                    batch_segs = np.array(batch_segs).astype(np.float32)
                    batch_seg_mask_A = np.array(batch_seg_mask_A).astype(
                        np.float32)

                    # Stash the batch on the instance; train_step reads these.
                    self.real_data = batch_images
                    self.seg_data = batch_segs

                    # Keep only the first input_nc channels.
                    self.real_A = self.real_data[:, :, :, :args.input_nc]
                    self.seg_A = self.seg_data[:, :, :, :args.input_nc]

                    self.mask_A = batch_seg_mask_A

                    self.train_step(args)

                    print((
                        "Epoch: [%2d] [%4d/%4d] time: %4.4f Gen_Loss: %f Disc_Loss: %f "
                        % (epoch, idx, batch_idxs, time.time() - start_time,
                           self.gen_loss, self.disc_loss)))

                # Per-epoch TensorBoard summaries: one sample image plus the
                # accumulated generator/discriminator loss metrics.
                with train_summary_writer.as_default():
                    fake = self.test_during_train(epoch, args)
                    tf.summary.image('Segmentation Epoch {}'.format(epoch),
                                     fake,
                                     step=epoch)

                    tf.summary.scalar('Generator Loss',
                                      generator_loss_metric.result(),
                                      step=epoch)
                    tf.summary.scalar('Discriminator Loss',
                                      discriminator_loss_metric.result(),
                                      step=epoch)

                generator_loss_metric.reset_states()
                discriminator_loss_metric.reset_states()
        except KeyboardInterrupt:
            # Fix: the old code saved here *and* in `finally`, writing the
            # same checkpoint twice on Ctrl-C; `finally` alone suffices.
            print(" [!] Training interrupted -- saving checkpoint")
        finally:
            self.save(args.checkpoint_dir, epoch)
Ejemplo n.º 24
0
    w2v_model = Word2Vec().load(word2vec_model_path)
    word2idx = w2v_model.get_word2idx()
    embedding = w2v_model.get_embedding()
    vocabulary_size = len(word2idx)
    print(f'\033[32;1mvocabulary_size: {vocabulary_size}\033[0m')

    if function not in globals():
        globals()[function] = getattr(
            importlib.import_module(function[:function.rfind('.')]),
            function.split('.')[-1])
    model = globals()[function](embedding)
    model.compile(Adam(lr), loss='binary_crossentropy', metrics=['acc'])
    model.summary()

    if training:
        trainX, trainY = utils.load_train_data(labeled_path, word2idx,
                                               max_seq_len)
        trainX, validX, trainY, validY = utils.train_test_split(
            trainX, trainY, split_ratio=0.1)
        print(
            f'\033[32;1mtrainX: {trainX.shape}, validX: {validX.shape}, trainY: {trainY.shape}, validY: {validY.shape}\033[0m'
        )
        checkpoint = ModelCheckpoint(model_path,
                                     'val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True)
        reduce_lr = ReduceLROnPlateau('val_loss',
                                      0.8,
                                      2,
                                      verbose=1,
                                      min_lr=1e-5)
Ejemplo n.º 25
0
                        nargs=2,
                        help='trainX and trainY')
    parser.add_argument('-T', '--no-training', action='store_true')
    parser.add_argument('-s',
                        '--test',
                        nargs=2,
                        help='testing file and the predicted file')
    args = parser.parse_args()

    model_path = args.model_path
    train_file = args.training_file
    training = not args.no_training
    test = args.test

    trainX, trainY, mean, std = utils.load_train_data(train_file[0],
                                                      train_file[1],
                                                      normalize=False)
    trainX, validX, trainY, validY = utils.train_test_split(
        trainX, trainY, 0.1)
    print(
        f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}, validX: {validX.shape}, validY: {validY.shape}\033[0m'
    )
    if training:
        model = GradientBoostingClassifier(
            learning_rate=0.1,
            n_estimators=200,
            max_depth=3,
            random_state=880301)  #, n_iter_no_change=10, tol=1e-4)
        model.fit(trainX, trainY.ravel())
        utils.save_model(model_path, model)
        #a = model.feature_importances_[1:].reshape(-1, 9)
Ejemplo n.º 26
0
    def run_temporal(self, checkpoint_dir, vid_dir, frame_ext, out_dir,
                     amplification_factor, fl, fh, fs, n_filter_tap,
                     filter_type):
        """Magnify video with a temporal filter.
        Args:
            checkpoint_dir: checkpoint directory.
            vid_dir: directory containing video frames videos are processed
                in sorted order.
            frame_ext: file extension of the input frames (e.g. "png").
            out_dir: directory to place output frames and resulting video.
            amplification_factor: the amplification factor,
                with 0 being no change.
            fl: low cutoff frequency.
            fh: high cutoff frequency.
            fs: sampling rate of the video.
            n_filter_tap: number of filter tap to use.
            filter_type: Type of filter to use. Can be one of "fir",
                "butter", or "differenceOfIIR". For "differenceOfIIR",
                fl and fh specifies rl and rh coefficients as in Wadhwa et al.
        """

        # ---- Design the temporal band-pass filter ------------------------
        nyq = fs / 2.0
        if filter_type == 'fir':
            filter_b = firwin(n_filter_tap, [fl, fh], nyq=nyq, pass_zero=False)
            filter_a = []  # empty denominator -> FIR path below
        elif filter_type == 'butter':
            filter_b, filter_a = butter(n_filter_tap, [fl / nyq, fh / nyq],
                                        btype='bandpass')
            # Drop a[0] (normalised to 1); the IIR loop assumes it is gone.
            filter_a = filter_a[1:]
        elif filter_type == 'differenceOfIIR':
            # This is a copy of what Neal did. Number of taps are ignored.
            # Treat fl and fh as rl and rh as in Wadhwa's code.
            # Write down the difference of difference equation in Fourier
            # domain to proof this:
            filter_b = [fh - fl, fl - fh]
            filter_a = [-1.0 * (2.0 - fh - fl), (1.0 - fl) * (1.0 - fh)]
        else:
            raise ValueError('Filter type must be either '
                             '["fir", "butter", "differenceOfIIR"] got ' + \
                             filter_type)

        # Encode the filter parameters into the output directory name.
        head, tail = os.path.split(out_dir)
        tail = tail + '_fl{}_fh{}_fs{}_n{}_{}'.format(fl, fh, fs, n_filter_tap,
                                                      filter_type)
        out_dir = os.path.join(head, tail)
        vid_name = os.path.basename(out_dir)
        # make folder
        mkdir(out_dir)
        vid_frames = sorted(glob(os.path.join(vid_dir, '*.' + frame_ext)))
        first_frame = vid_frames[0]
        im = imread(first_frame)
        # NOTE(review): this unpack assumes a 2-D (grayscale) frame; an RGB
        # frame would raise here -- confirm the expected input format.
        image_height, image_width = im.shape
        if not self.is_graph_built:
            self.image_width = image_width
            self.image_height = image_height
            # Figure out image dimension
            self._build_IIR_filtering_graphs()
            ginit_op = tf.global_variables_initializer()
            linit_op = tf.local_variables_initializer()
            self.sess.run([ginit_op, linit_op])

            if self.load(checkpoint_dir):
                print("[*] Load Success")
            else:
                raise RuntimeError('MagNet: Failed to load checkpoint file.')
            self.is_graph_built = True
        try:
            # Tag the output video with the checkpoint's iteration number.
            i = int(self.ckpt_name.split('-')[-1])
            print("Iteration number is {:d}".format(i))
            vid_name = vid_name + '_' + str(i)
        except (AttributeError, ValueError, IndexError):
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt / SystemExit.
            print("Cannot get iteration number")

        # Fix: was `len(filter_a) is not 0` -- identity comparison with an
        # int literal relies on CPython small-int caching and raises
        # SyntaxWarning on Python >= 3.8.
        if len(filter_a) != 0:
            # ---- IIR path: run the difference equation frame by frame ----
            x_state = []  # most-recent shape encodings (filter inputs)
            y_state = []  # most-recent filter outputs

            for frame in tqdm(vid_frames, desc='Applying IIR'):
                file_name = os.path.basename(frame)
                frame_no, _ = os.path.splitext(file_name)
                frame_no = int(frame_no)
                in_frames = [
                    load_train_data([frame, frame, frame],
                                    gray_scale=self.n_channels == 1,
                                    is_testing=True)
                ]
                in_frames = np.array(in_frames).astype(np.float32)

                texture_enc, x = self.sess.run(
                    [self.texture_enc, self.shape_rep],
                    feed_dict={
                        self.input_image: in_frames[:, :, :, :3],
                    })
                x_state.insert(0, x)
                # set up initial condition.
                while len(x_state) < len(filter_b):
                    x_state.insert(0, x)
                if len(x_state) > len(filter_b):
                    x_state = x_state[:len(filter_b)]
                # y[n] = sum_i b[i] x[n-i] - sum_j a[j] y[n-1-j]
                y = np.zeros_like(x)
                for i in range(len(x_state)):
                    y += x_state[i] * filter_b[i]
                for i in range(len(y_state)):
                    y -= y_state[i] * filter_a[i]
                # update y state
                y_state.insert(0, y)
                if len(y_state) > len(filter_a):
                    y_state = y_state[:len(filter_a)]

                out_amp = self.sess.run(self.output_image,
                                        feed_dict={
                                            self.out_texture_enc:
                                            texture_enc,
                                            self.filtered_enc:
                                            y,
                                            self.ref_shape_enc:
                                            x,
                                            self.amplification_factor:
                                            [amplification_factor]
                                        })

                # Map network output from [-1, 1] back to uint8 pixels.
                im_path = os.path.join(out_dir, file_name)
                out_amp = np.squeeze(out_amp)
                out_amp = (127.5 * (out_amp + 1)).astype('uint8')
                cv2.imwrite(im_path,
                            cv2.cvtColor(out_amp, code=cv2.COLOR_RGB2BGR))
        else:
            # This does FIR in fourier domain. Equivalent to cyclic
            # convolution.
            x_state = None
            for i, frame in tqdm(enumerate(vid_frames),
                                 desc='Getting encoding'):
                file_name = os.path.basename(frame)
                in_frames = [
                    load_train_data([frame, frame, frame],
                                    gray_scale=self.n_channels == 1,
                                    is_testing=True)
                ]
                in_frames = np.array(in_frames).astype(np.float32)

                texture_enc, x = self.sess.run(
                    [self.texture_enc, self.shape_rep],
                    feed_dict={
                        self.input_image: in_frames[:, :, :, :3],
                    })
                if x_state is None:
                    # Allocate once the encoding shape is known: one slot per
                    # frame along the new last axis.
                    x_state = np.zeros(x.shape + (len(vid_frames), ),
                                       dtype='float32')
                x_state[:, :, :, :, i] = x

            filter_fft = np.fft.fft(np.fft.ifftshift(filter_b),
                                    n=x_state.shape[-1])
            # Filtering
            for i in trange(x_state.shape[1], desc="Applying FIR filter"):
                x_fft = np.fft.fft(x_state[:, i, :, :], axis=-1)
                x_fft *= filter_fft[np.newaxis, np.newaxis, np.newaxis, :]
                # NOTE(review): assigning the complex ifft into a float32
                # array silently drops the imaginary part (ComplexWarning).
                x_state[:, i, :, :] = np.fft.ifft(x_fft)

            for i, frame in tqdm(enumerate(vid_frames), desc='Decoding'):
                file_name = os.path.basename(frame)
                frame_no, _ = os.path.splitext(file_name)
                frame_no = int(frame_no)
                in_frames = [
                    load_train_data([frame, frame, frame],
                                    gray_scale=self.n_channels == 1,
                                    is_testing=True)
                ]
                in_frames = np.array(in_frames).astype(np.float32)
                texture_enc, _ = self.sess.run(
                    [self.texture_enc, self.shape_rep],
                    feed_dict={
                        self.input_image: in_frames[:, :, :, :3],
                    })
                # NOTE(review): `x` here is the shape encoding left over from
                # the *last* frame of the encoding pass; confirm that using it
                # as the reference for every decoded frame is intended.
                out_amp = self.sess.run(self.output_image,
                                        feed_dict={
                                            self.out_texture_enc:
                                            texture_enc,
                                            self.filtered_enc:
                                            x_state[:, :, :, :, i],
                                            self.ref_shape_enc:
                                            x,
                                            self.amplification_factor:
                                            [amplification_factor]
                                        })

                im_path = os.path.join(out_dir, file_name)
                out_amp = np.squeeze(out_amp)
                out_amp = (127.5 * (out_amp + 1)).astype('uint8')
                cv2.imwrite(im_path,
                            cv2.cvtColor(out_amp, code=cv2.COLOR_RGB2BGR))
            del x_state

        # Try to combine it into a video
        call([
            DEFAULT_VIDEO_CONVERTER, '-y', '-f', 'image2', '-r', '30', '-i',
            os.path.join(out_dir, '%06d.png'), '-c:v', 'libx264',
            os.path.join(out_dir, vid_name + '.mp4')
        ])
Ejemplo n.º 27
0
import os
import numpy as np
import time
import scipy
from sklearn import svm

from utils import euclidean_dist, read_img, read_txt, load_train_data, load_val_data, get_labels

# Object classes used by the classifiers in this script.
category = ['aeroplane', 'car', 'horse', 'motorbike',
            'person']  # DON'T MODIFY THIS.
# Absolute path to the dataset root on the training machine.
data_dir = '/w11/hyewon/data/practical-category-recognition-2013a/data'

# NOTE(review): this import-time load duplicates the load done inside
# Nonlinear_Trainer() below -- confirm it is actually needed here.
train_imgs, train_idxs = load_train_data(data_dir)


def nonlinear_classifier(features, labels):
    """Fit and return an RBF nu-SVM (nu=1, gamma='auto') on the given data."""
    clf = svm.NuSVC(nu=1, gamma='auto')
    clf.fit(features, labels)
    return clf


def Nonlinear_Trainer():
    print("Load the training data...")
    start_time = time.time()
    train_imgs, train_idxs = load_train_data(data_dir)
    del train_imgs
    print("{:.4f} seconds".format(time.time() - start_time))

    print("Extract the image features...")
    train_features = np.load('./train_bow.npy')
Ejemplo n.º 28
0
import utils
from Adam import Adam


def rmse(X, Y, w):
    """Root-mean-square error of the linear prediction ``X @ w`` against ``Y``."""
    squared_residuals = (X.dot(w) - Y) ** 2
    return squared_residuals.mean() ** 0.5


def compute_gradient(XTX, XTY, w):
    """Gradient of the squared-error loss, expressed through the
    normal-equation terms XTX = X^T X and XTY = X^T Y."""
    residual_term = XTX.dot(w) - XTY
    return residual_term * 2


if __name__ == '__main__':
    trainX, trainY, mean, std = utils.load_train_data(sys.argv[1],
                                                      9,
                                                      preprocessing=False)

    XTX = trainX.T @ trainX
    XTY = trainX.T @ trainY

    for lr in [1., 1e-1, 1e-2, 1e-3]:
        loss = []
        optimizer = Adam(compute_gradient, lr)

        epoches = 40000
        w = np.random.normal(0, 0.05,
                             size=(trainX.shape[1], 1)).astype(np.float32)
        for epoch in range(epoches):
            optimizer.update(XTX, XTY, w)
            if epoch % 100 == 0:
    def train(self, args):
        """Train the GAN: alternate generator/discriminator updates over
        `args.epoch` epochs, logging losses to TensorBoard and periodically
        sampling and checkpointing.
        """

        self.writer = SummaryWriter('logs/' + args.experiment)
        start_time = time.time()
        self.current_epoch = 0
        counter = 0  # global iteration counter across all epochs

        # Inverted binary mask used by the generator objective.
        mask = torch.from_numpy(1.0 - self.file_mask["mask"][0, :, :, 0]).to(
            self.device)
        batch_idxs = list(
            range(int(floor(float(self.train_set_size) / args.batch_size))))

        for self.current_epoch in range(args.epoch):

            # Unless transfer training, regenerate a fresh training set with
            # newly sampled random masks at the start of each epoch (the h5
            # files must be closed before they are rewritten).
            if not args.transfer:
                self.file_trainA.close()
                self.file_trainB.close()
                self.file_training_mask.close()

                print(" [*] Generating training pairs ...")
                self.data_generator.genMask(mask_sampling='random')
                self.data_generator.saveTrainDataset()

                self.file_trainA = h5py.File(self.file_name_trainA, 'r')
                self.file_trainB = h5py.File(self.file_name_trainB, 'r')
                self.file_training_mask = h5py.File(
                    self.file_name_training_mask, 'r')
                mask = torch.from_numpy(
                    1.0 - self.file_training_mask["mask"][0, :, :, 0]).to(
                        self.device)

            # Shuffle the batch order, then apply learning-rate decay.
            shuffle(batch_idxs)
            self.decay_lr(args)

            for idx in range(0, len(batch_idxs)):

                training_batch = load_train_data(batch_idxs[idx], batch_size=args.batch_size, \
                    fileA=self.file_trainA, fileB=self.file_trainB, dataset="train_dataset", device=self.device)

                # Update G network and record fake outputs
                G_loss, pred_prob = self.G_objective(training_batch, mask)

                # Manual backward pass: compute the gradients w.r.t. G's
                # parameters, assign them to .grad, then step the optimiser.
                grad_G = torch.autograd.grad(G_loss,
                                             self.G.parameters(),
                                             create_graph=False,
                                             retain_graph=False)
                for param, grad in zip(self.G.parameters(), grad_G):
                    param.grad = grad
                self.optim_G.step()
                self.writer.add_scalar('G_loss', G_loss, counter + 1)

                # Update D network
                D_loss = self.D_objective(training_batch, pred_prob)

                grad_D = torch.autograd.grad(D_loss,
                                             self.D.parameters(),
                                             create_graph=False)
                for param, grad in zip(self.D.parameters(), grad_D):
                    param.grad = grad
                self.optim_D.step()
                self.writer.add_scalar('D_loss', D_loss, counter + 1)

                counter += 1

                print(((
                    "Epoch: [%d/%d] | Iteration: [%d/%d] | time: %4.2f | generator loss: %f | "
                    + "discriminator loss: %4.8f") %
                       (self.current_epoch + 1, args.epoch, idx + 1,
                        len(batch_idxs), time.time() - start_time, G_loss,
                        D_loss)))

                # Periodic sampling, and checkpointing (the checkpoint is
                # also written on the very last iteration of the last epoch).
                if np.mod(counter, args.sample_freq) == 0:
                    self.sample_model(args.sample_dir, counter)

                if np.mod(counter, args.save_freq) == 0 or \
                    ((self.current_epoch == args.epoch - 1) and (idx == len(batch_idxs) - 1)):
                    self.save(
                        os.path.join(args.checkpoint_dir, args.experiment),
                        self.current_epoch)
		classifier = algo(random_state = 0, **options)
		classifier.fit(X_train, y_train)

		scores.append(classifier.score(X_test, y_test))

	score = np.mean(scores)
	print "Score on training set (with cross-validation) for %s : %.5f" % (name, score)
	# with open('decisiontree.dot', 'w') as f:
	# 	f = export_graphviz(classifier, out_file = f, feature_names = X.columns.tolist())
	global best_score, best_classifier, best_classifier_name
	if score >= best_score:
		best_score = score
		best_classifier = (algo, options, name)

# Python 2 script body: load the training set with the formatting helpers
# applied, then keep only the selected feature columns.
df, X, Y = load_train_data(formatting_functions)
X = X[features]

# Baseline decision tree plus a few single-knob variations.
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree")
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with entropy criterion", {'criterion': 'entropy'})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with sqrt(features)", {'max_features': 'sqrt'})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with max_depth=5", {'max_depth': 5})
test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with min_samples_leaf=5", {'min_samples_leaf': 5})

print
# Exhaustive grid search over max_depth x min_samples_leaf (20 x 20 fits).
for min_samples in range(1,21):
	for max_depth in range(1,21):
		test_algo(DecisionTreeClassifier, X, Y, "Decision Tree with max_depth=%d and min_samples_leaf=%d" % 
			(max_depth, min_samples), {'max_depth': max_depth, 'min_samples_leaf': min_samples})
print
Ejemplo n.º 31
0
    K.set_session(sess)

    max_seq_len = 32
    w2v_model = Word2Vec().load(word2vec_model_path)
    word2idx = w2v_model.get_word2idx()
    vocabulary_size = len(word2idx)
    print(f'\033[32;1mvocabulary_size: {vocabulary_size}\033[0m')

    if function not in globals():
        globals()[function] = getattr(importlib.import_module(function[:function.rfind('.')]), function.split('.')[-1])
    model = globals()[function]((len(word2idx),))
    model.compile(Adam(1e-3), loss='binary_crossentropy', metrics=['acc'])
    model.summary()

    if training:
        trainX, trainY = utils.load_train_data(labeled_path, word2idx, max_seq_len)
        trainX, validX, trainY, validY = utils.train_test_split(trainX, trainY, split_ratio=0.1)
        validX = bag_of_word(validX, vocabulary_size)
        print(f'\033[32;1mtrainX: {trainX.shape}, validX: {validX.shape}, trainY: {trainY.shape}, validY: {validY.shape}\033[0m')
        checkpoint = ModelCheckpoint(model_path, 'val_loss', verbose=1, save_best_only=True, save_weights_only=True)
        reduce_lr = ReduceLROnPlateau('val_loss', 0.8, 2, verbose=1, min_lr=1e-5)
        #logger = CSVLogger(model_path+'.csv', append=True)
        #tensorboard = TensorBoard(model_path[:model_path.rfind('.')]+'_logs', histogram_freq=1, batch_size=1024, write_grads=True, write_images=True, update_freq=512)
        model.fit_generator(bag_of_word_generator(trainX, trainY, vocabulary_size, batch_size=256), steps_per_epoch=int(np.ceil(trainX.shape[0] / 256)), validation_data=(validX, validY), epochs=10, callbacks=[checkpoint, reduce_lr])

    else:
        print('\033[32;1mLoading Model\033[0m')

    model.load_weights(model_path)
    #sentences = [text_to_word_sequence('today is a good day, but it is hot'), text_to_word_sequence('today is hot, but it is a good day')]
    #sentences = utils.data_preprocessing(sentences, word2idx, max_seq_len)
Ejemplo n.º 32
0
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path')
    parser.add_argument('model_path')
    parser.add_argument('task', type=int)
    parser.add_argument('st', type=int)
    parser.add_argument('ed', type=int)
    parser.add_argument('-s', '--step', default=1)
    args = parser.parse_args()
    data_path = args.data_path
    model_path = args.model_path
    task = args.task
    st = args.st
    ed = args.ed
    step = args.step

    trainX, trainY = utils.load_train_data(data_path)
    trainX, validX, trainY, validY = utils.train_test_split(
        trainX[:, 1:] if task == 1 else
        trainX[:, [0, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13]], trainY)
    print(
        f'\033[32;1mtrainX: {trainX.shape}, validX: {validX.shape}, trainY: {trainY.shape}, validY: {validY.shape}\033[0m'
    )

    if os.path.exists(model_path):
        tree = utils.load_model(model_path)
        print("Load Model")
    else:
        best_train_score, best_valid_score, min_param = 0, 0, 0
        base = 2
        for m in range(st, ed + 1, step):
            r = range(0, 4) if task == 1 else range(3, 7)
#!/usr/bin/env python
# -*- coding: UTF8 -*-

import numpy as np
from utils import load_train_data, output_predictions, add_sex_bit, fill_fare, add_title, test_algo
from sklearn.tree import DecisionTreeClassifier

def logarize_fare(X):
	"""Add a 'FareLog' column holding log10(Fare + 1); the +1 keeps zero fares finite."""
	shifted_fare = X['Fare'] + 1
	X['FareLog'] = np.log10(shifted_fare)

# Python 2 script body: load the training set with all formatting helpers
# applied (including the FareLog column added by logarize_fare).
df, X, Y = load_train_data([add_sex_bit, fill_fare, add_title, logarize_fare])

# Feature set: Title replaces Age, FareLog replaces the raw Fare.
features = ['Pclass', 'SibSp', 'Parch', 'FareLog', 'SexBit', 'Title']

# Cross-validated score for this feature/hyper-parameter combination.
test_algo(DecisionTreeClassifier, X[features], Y, "Decision Tree with Title instead of Age and log(Fare+1)", {'max_depth': 10, 'min_samples_leaf': 8})

# Refit on the full training set and write the submission file.
classifier = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 8)
classifier.fit(X[features], Y)

output_predictions(classifier, "06_submission.csv", [add_sex_bit, fill_fare, add_title, logarize_fare], features)

# Show which features the fitted tree relied on most.
print
print "Importance of features:"
print features
print classifier.feature_importances_
Ejemplo n.º 34
0
from utils import load_train_data, dependencies
from optparse import OptionParser
from algorithm import Model
from sklearn import cross_validation
import numpy as np

parser = OptionParser()
parser.add_option("-i", "--input", action="store", type="string", dest="input", default="enrone.txt")
parser.add_option("-n", "--samples", action="store", type="int", dest="N", default=10**5)
parser.add_option("-s", "--selection", action="store", type="string", dest="selection_method", default="none")
parser.add_option("-c", "--cls", action="store", type="string", dest="classifier_type", default="logreg")
parser.add_option("-v", "--vectorizer", action="store", type="string", dest="vectorizer", default="bow")
(options, args) = parser.parse_args()

print 'loading data..'
X, y = load_train_data(options.input)
X, y = X[:options.N], y[:options.N]

print options.classifier_type, options.selection_method

m = Model(classifier_type=options.classifier_type, selection_method=options.selection_method, vectorizer=options.vectorizer)

print "num of training instances: ", len(y)
print "num of training classes: ", len(set(y))
print "performing cross-validation.."

scores = cross_validation.cross_val_score(estimator=m, X=X, y=np.asarray(y), cv=3)

print "3-fold cross-validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
Ejemplo n.º 35
0
                        action='store_true',
                        help='preprocess testing file and the predicted file')
    parser.add_argument('-e',
                        '--ensemble',
                        action='store_true',
                        help='output npy file to ensemble later')
    args = parser.parse_args()

    # Unpack CLI options into locals for readability.
    model_path = args.model_path
    train_file = args.training_file
    training = not args.no_training
    test = args.test
    preprocessing = args.preprocessing
    ensemble = args.ensemble

    # train_file appears to be a two-element sequence of paths — TODO
    # confirm against the --training-file nargs and utils.load_train_data.
    trainX, trainY, mean, std = utils.load_train_data(
        train_file[0], train_file[1], preprocessing=preprocessing)
    trainX, validX, trainY, validY = utils.train_test_split(trainX, trainY)
    # ANSI bright-green banner with the split shapes.
    print(
        f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}, validX: {validX.shape}, validY: {validY.shape}\033[0m'
    )
    if training:
        # Mini-batch training of a single weight vector w with Adam
        # (gradient is defined elsewhere in this file).
        optimizer = Adam(gradient, 1e-4)

        epochs = 150
        batch_size = 64
        w = np.zeros((trainX.shape[1], 1), np.float32)
        for epoch in range(1, epochs + 1):
            # Reshuffle each epoch so mini-batches differ between epochs.
            idx = np.random.permutation(trainX.shape[0])
            shuffleX, shuffleY = trainX[idx], trainY[idx]
            for i in range(0, idx.shape[0], batch_size):
                batchX, batchY = shuffleX[i:i +
Ejemplo n.º 36
0
def save_random_paragraphs():
    """Dump the first 100 training paragraphs to 'example_test.txt',
    each followed by the PARAGRAPH separator.

    Uses load_train_data() / PARAGRAPH defined elsewhere in this module.
    """
    X, y = load_train_data()
    # BUG FIX: the file handle was opened and never closed; 'with'
    # guarantees it is flushed and closed even if a write raises, so the
    # tail of the output cannot be lost.
    with open('example_test.txt', 'wb') as output:
        for x in X[:100]:
            output.write(x)
            output.write(PARAGRAPH)
#!/usr/bin/env python
# -*- coding: UTF8 -*-

import numpy as np
import scipy as sp
from utils import load_train_data, test_algo, output_predictions, add_sex_bit, fill_fare, add_title, logarize_fare
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation

# Preprocessing steps and feature list shared by training and prediction.
formatting = [add_sex_bit, fill_fare, add_title, logarize_fare]
features = ['Pclass', 'SibSp', 'Parch', 'SexBit', 'Title', 'LogFarePlusOne']

df, X, Y = load_train_data(formatting)

# SVM, searching for best value for C
# 8-fold CV grid search over a log-spaced C range (legacy sklearn
# cross_validation / grid_search APIs, Python 2 script).
kf = cross_validation.KFold(n=len(X), n_folds=8, indices=True)
grid = GridSearchCV(estimator = SVC(random_state=0), cv = kf, verbose = 1,
					param_grid = dict(C=[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 5.0, 10.0, 15.0, 30.0, 100.0]))
grid.fit(X[features], Y)

svm_score = grid.best_score_
svm_C = grid.best_params_['C']

print "Best parameters for SVM: C=%5g with score=%.5f" % (svm_C, svm_score)

# Refit on the full training set with the winning C (script continues
# below this excerpt; these names are used later).
svm_classifier = SVC(C = svm_C, random_state = 0)
svm_classifier.fit(X[features], Y)