Example No. 1
def mem_iter(batchsize, feat, fresh=False):
    '''
    Memory iterator: load all features into memory before training.
    '''
    # extract all features
    train_data, train_label, _ = load_data(mode='train', feat_type=feat, fresh=fresh)
    train_data = np.vstack(train_data)
    # mean = np.mean(train_data, axis=0)
    # std = np.std(train_data, axis=0)
    # train_data = (train_data - mean) / std
    dev_data, dev_label, _ = load_data(mode='dev', feat_type=feat, fresh=fresh)
    dev_data = np.vstack(dev_data)
    # mean = np.mean(dev_data, axis=0)
    # std = np.std(dev_data, axis=0)
    # dev_data = (dev_data - mean) / std

    train = DataSet(train_data, np.hstack(train_label))
    print(len(train))

    dev = DataSet(dev_data, np.hstack(dev_label))
    print(len(dev))

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    dev_iter = chainer.iterators.SerialIterator(dev, batchsize, repeat=False, shuffle=False)

    return train_iter, dev_iter
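A hedged usage sketch of the iterators returned above, wired into a standard Chainer trainer; the tiny MLP stand-in network and the 'mfcc' feat_type are assumptions, not part of the original code.

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions

class MLP(chainer.Chain):
    # Stand-in network; the real model lives elsewhere in the project.
    def __init__(self, n_units=256, n_out=10):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(n_units, n_out)

    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))

train_iter, dev_iter = mem_iter(batchsize=64, feat='mfcc')  # 'mfcc' is an assumed feat_type
model = L.Classifier(MLP())
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

updater = training.StandardUpdater(train_iter, optimizer, device=-1)
trainer = training.Trainer(updater, (20, 'epoch'), out='result')
trainer.extend(extensions.Evaluator(dev_iter, model, device=-1))
trainer.run()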
Example No. 2
def evaluate_models_train_test(path_train,
                               path_test,
                               models=(ID3, KNN, NAIVE_BAYES)):
    """
    evaluate several models with given train and test
    :param models: tuple, models to evaluate
    """
    ds_train = DataSet(path_train)
    ds_test = DataSet(path_test)

    accuracies = []
    model_list = {
        model: get_model(model, 5 if model == KNN else ds_train.header)
        for model in models
    }

    # for each model -> fit train-set and predict test set
    for model_name in models:
        print(model_name + "\n" + "=" * 20)
        acc = fit_predict(model_list[model_name], ds_train, ds_test)
        accuracies.append(acc)
        print("accuracy=" + str(acc))
        print("=" * 20 + "\n")

    return str(model_list[ID3]) + "\n\n" + "\t".join(
        [str(round(v, 2)) for v in accuracies])
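The fit_predict helper used here (and in the cross-validation example later) is not shown; a minimal sketch of the interface it appears to assume, fit on the training set, predict the test set, return accuracy, could be:

def fit_predict(model, ds_train, ds_test):
    # Hypothetical helper: train on ds_train, predict ds_test and return accuracy.
    model.fit(ds_train)
    predictions, truths = model.predict(ds_test)
    correct = sum(1 for p, t in zip(predictions, truths) if p == t)
    return correct / len(truths)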
Example No. 3
def plotSavedModel(modeindex):

    # Import Dataset
    modes = DataSet.learningModes
    data = DataSet(modes[modeindex])
    data.print()

    # Network Parameters
    WIDTH = data.WIDTH
    HEIGHT = data.HEIGHT
    CHANNELS = data.CHANNELS_IN
    NUM_INPUTS = WIDTH * HEIGHT * CHANNELS
    NUM_OUTPUTS = data.CHANNELS_OUT

    # Network variables and placeholders
    X = tf.placeholder(tf.float32, [None, HEIGHT, WIDTH, CHANNELS])  # Input
    Y = tf.placeholder(
        tf.float32, [None, HEIGHT, WIDTH, NUM_OUTPUTS])  # Truth Data - Output
    global_step = tf.Variable(0,
                              dtype=tf.int32,
                              trainable=False,
                              name='global_step')

    # Define loss and optimizer
    prediction = model.unet(X, NUM_OUTPUTS)

    # Setup Saver
    saver = tf.train.Saver()

    # Initialize variables, and run the network
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    ckpt = tf.train.get_checkpoint_state('./checkpoints/' + modes[modeindex])
    if (ckpt and ckpt.model_checkpoint_path):
        print('Restoring Prev. Model ....')
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('Model Loaded....')

        # Show results
        prediction = sess.run(prediction,
                              feed_dict={
                                  X: data.x_test,
                                  Y: data.y_test
                              })

        # Compute metrics of prediction
        metrics = do_metrics(prediction, data)

        # index = np.random.randint(data.x_test.shape[0])
        index = 4
        print('Selecting Test Image #', index)
        plot(data, prediction, modeindex, index)
Example No. 4
    def __init__(self):
        super(Bm25Search, self).__init__()
        self.__data = [" ".join(ans) for ans in DataSet().answers]
        self.__quest = [" ".join(q) for q in DataSet().questions]
        self.__time = time()
        self.doc_lens, self.av_len = self.__get_av_len()
        self.count_vec = CountVectorizer(input="content", ngram_range=(1, 1))
        self.doc_count = len(self.__data)
        self.__tf_matrix = self.__vectorize_data()
        self.bm32_tf_matrix = self.__get_bm25_tf()
        self.__vocabulary = self.__get_vocabulary()
        self.__word_indexes = self.__get_word_indexes()
        self.idf = self.__get_idfs()
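A stand-alone sketch of the BM25 term-frequency weighting that __get_bm25_tf presumably builds, assuming the term-frequency matrix is a dense NumPy array and using the conventional k1 and b constants:

import numpy as np

def bm25_tf(tf_matrix, doc_lens, av_len, k1=2.0, b=0.75):
    # Standard BM25 term-frequency component: saturate raw counts and
    # normalize by document length relative to the average length.
    norm = k1 * (1.0 - b + b * (np.asarray(doc_lens) / av_len))
    return tf_matrix * (k1 + 1.0) / (tf_matrix + norm[:, None])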
Example No. 5
    def test(self):
        """
        evaluate the model for accuracy
        """
        # time at the start of validation
        start = time.time()
        if self.use_cuda:
            self.model.cuda()

        test_data = DataSet(self.order, split='test')
        test_dataloader = data.DataLoader(test_data,
                                          shuffle=True,
                                          batch_size=64,
                                          num_workers=4)

        # Tracking variables
        total_correct, tmp_correct, t_steps = 0, 0, 0

        print("Validation step started...")
        for batch in tqdm(test_dataloader, desc='Batch'):
            batch_cp = copy.deepcopy(batch)
            del batch
            contents, attn_masks, labels = batch_cp
            if self.use_cuda:
                contents = contents.squeeze(1).cuda()
                attn_masks = attn_masks.squeeze(1).cuda()
            keys = self.model.get_keys(contents, attn_masks)
            retrieved_batches = self.memory.get_neighbours(keys.cpu().numpy())
            del keys
            ans_logits = []
            # Iterate over the test batch to calculate a label for each document (i.e., content)
            # and store them in a list for comparison later
            for content, attn_mask, (rt_contents, rt_attn_masks,
                                     rt_labels) in tqdm(zip(
                                         contents, attn_masks,
                                         retrieved_batches),
                                                        total=len(contents),
                                                        desc='Refit',
                                                        leave=False):
                if self.use_cuda:
                    rt_contents = rt_contents.cuda()
                    rt_attn_masks = rt_attn_masks.cuda()
                    rt_labels = rt_labels.cuda()

                logits = self.model.infer(content, attn_mask, rt_contents,
                                          rt_attn_masks, rt_labels)

                ans_logits.append(logits.cpu().numpy())
            # Dropping the 1 dim to match the logits' shape
            # shape : (batch_size,num_labels)
            labels = labels.squeeze(1).numpy()
            # print(np.asarray(ans_logits), labels)
            tmp_correct = self.calc_correct(np.asarray(ans_logits), labels)
            # del labels
            total_correct += tmp_correct
            t_steps += len(labels.flatten())
        end = time.time()
        print("Time taken for validation {} minutes".format(
            (end - start) / 60))
        print("Validation Accuracy: {}".format(total_correct / t_steps))
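calc_correct is not shown; one plausible sketch of that method (written here as a free function), assuming logits of shape (batch_size, num_labels) and ground-truth labels given as integer class indices, is:

import numpy as np

def calc_correct(logits, labels):
    # Count predictions whose argmax matches the ground-truth class index.
    preds = np.argmax(logits, axis=-1)
    return int(np.sum(preds == labels.flatten()))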
Example No. 6
def check_naive_bayes():
    ds_ = DataSet("dataset.txt")
    naive_bayes_ = NaiveBayes(ds_.header)
    naive_bayes_.fit(ds_)
    predict, true = naive_bayes_.predict(ds_)

    TP, TN, FP, FN, acc, recall, precision, f1 = get_measures(predict, true)
    print("accuracy:", acc, "\n"
          "recall:", recall, "\n"
          "precision:", precision, "\n"
          "f1:", f1, "\n")
Example No. 7
def check_knn():
    ds_ = DataSet("dataset.txt")
    knn_ = Knn(5)
    knn_.fit(ds_)
    predict, true = knn_.predict(ds_)

    TP, TN, FP, FN, acc, recall, precision, f1 = get_measures(predict, true)
    print("accuracy:", acc, "\n"
          "recall:", recall, "\n"
          "precision:", precision, "\n"
          "f1:", f1, "\n")
Example No. 8
    def fit(self, num_epochs, batch_size):
        train_iter = DataSet(batch_size).input_data(self.train_path,
                                                    is_training=True)
        eval_iter = DataSet(batch_size=1).input_data(self.eval_path,
                                                     is_training=False)

        # An error occurs at this point; the cause has not been found yet.
        # num_dataset = sum(1 for _ in tf.python_io.tf_record_iterator(self.train_path))
        num_dataset = 10

        train_image, train_label = train_iter.get_next()
        eval_image, eval_label = eval_iter.get_next()

        with tf.Session() as sess:
            sess.run(train_iter.initializer)
            sess.run(eval_iter.initializer)

            epoch = 0
            while True:
                if epoch <= num_epochs:
                    for step in range(num_dataset // batch_size):
                        image, label = sess.run([train_image, train_label])
                        print(image.shape, label.shape)
                    # advance the epoch counter so the loop eventually terminates
                    epoch += 1
                else:
                    break
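For the commented-out record count above, a small helper using the same TF 1.x API as that comment; merely a sketch of how num_dataset could be computed instead of the hard-coded 10:

import tensorflow as tf

def count_tfrecords(path):
    # Count records in a TFRecord file with the TF 1.x iterator API.
    return sum(1 for _ in tf.python_io.tf_record_iterator(path))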
Example No. 9
def check_dtl():
    ds_ = DataSet("dataset.txt")
    dtl_ = DecisionTree(ds_.header)
    tree_ = dtl_.fit(ds_)
    predict, true = dtl_.predict(ds_)

    TP, TN, FP, FN, acc, recall, precision, f1 = get_measures(predict, true)
    with open("tree.txt", "wt") as f:
        f.write(str(dtl_))
    print("accuracy:", acc, "\n"
          "recall:", recall, "\n"
          "precision:", precision, "\n"
          "f1:", f1, "\n")
    e = 0
Example No. 10
def run():
    configure_logger()
    logger = get_logger()

    dataset = DataSet('../datasets/wiki-new')

    FLAGS = {
        "embedding_length": 10,
        "min_counts": 10,
        "batch_size": 16,
        "hidden_unit_size": 10,
        "learning_rate": .001
    }

    disambiguator = NeuralDisambiguator(dataset,
                                        FLAGS,
                                        use_pretrained_embeddings=False)
    disambiguator.fit(max_steps=2000)
Example No. 11
def cross_validation(path_data, model, k=5):
    """
    perform K-fold cross validation on model & data-set
    :return: average accuracy
    """
    ds = DataSet(path_data)
    arg = 5 if model == KNN else ds.header

    accuracies = []
    # fit K times + check accuracy
    for i, (train, test) in enumerate(k_fold(k, ds)):
        acc = fit_predict(get_model(model, arg), train, test)
        print("fold " + str(i) + ": accuracy=" + str(acc))
        accuracies.append(acc)

    # return average
    aggregate_acc = sum(accuracies) / len(accuracies)
    print("aggregate accuracy=" + str(aggregate_acc))
    return aggregate_acc
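k_fold is assumed to yield k (train, test) splits; a minimal illustrative splitter over a plain list of examples (the project's real k_fold operates on its DataSet objects) could look like:

def k_fold(k, examples):
    # Hypothetical k-fold splitter: yields (train, test) pairs of example lists.
    fold_size = len(examples) // k
    for i in range(k):
        test = examples[i * fold_size:(i + 1) * fold_size]
        train = examples[:i * fold_size] + examples[(i + 1) * fold_size:]
        yield train, test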
Example No. 12
def main():
    #main function
    Epoch = cfg.epoch
    LR = cfg.lr
    B_size = cfg.train_batch_size
    #test_every_epoch = cfg.test_every_epoch
    #VALID = False
    model_name = [
        'alex', 'vgg16', 'vggf', 'cc', 'cct', 'incep', 'net', 'region_alex'
    ]
    start = time.time()
    model, criterion = build_model(model_type=cfg.model_type)
    dataset = DataSet(cfg)
    logfile = 'logfile/' + model_name[cfg.model_type] + '_train.txt'
    with open(logfile, 'w'):
        pass  # create / truncate the log file

    print('Start Training!')
    for epoch in range(Epoch):
        LR = adjust_learning_rate(LR, epoch)
        optimizer = optim.Adam(model.parameters(), lr=LR)
        train(dataset.train_loader, model, criterion, optimizer, epoch,
              logfile)
        if ((epoch + 1) % cfg.test_every_epoch == 0):
            print('validating......')
            pre, tar = valid(dataset.test_loader, model, criterion)
            pre = np.array(pre)
            tar = np.array(tar)
            np.savetxt(
                'results/' + model_name[cfg.model_type] + str(epoch + 1) +
                '.txt', pre)
            np.savetxt(
                'results/' + model_name[cfg.model_type] + str(epoch + 1) +
                '_tar.txt', tar)
            print('Accuracy: ', acc(pre, tar))
            print('Saving model...')
            torch.save(model.state_dict(),
                       'results/' + model_name[cfg.model_type] + '_ck.pth')
    print('Done!')
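adjust_learning_rate is defined elsewhere; a common step-decay sketch compatible with how it is called above (the decay factor and interval are assumptions):

def adjust_learning_rate(lr, epoch, decay=0.1, every=30):
    # Illustrative step decay: shrink the learning rate by `decay` every `every` epochs.
    if epoch > 0 and epoch % every == 0:
        return lr * decay
    return lr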
Example No. 13
def runNetwork(modeindex, doRestore=False):

    # Import Dataset
    modes = DataSet.learningModes
    data = DataSet(modes[modeindex])
    data.print()

    # Training Parameters
    learning_rate = 1e-4
    num_steps = 30000
    batch_size = 16
    display_step = 500
    save_step = 10000

    # Network Parameters
    WIDTH = data.WIDTH
    HEIGHT = data.HEIGHT
    CHANNELS = data.CHANNELS_IN
    NUM_INPUTS = WIDTH * HEIGHT * CHANNELS
    NUM_OUTPUTS = data.CHANNELS_OUT

    # Network variables and placeholders
    X = tf.placeholder(tf.float32, [None, HEIGHT, WIDTH, CHANNELS])  # Input
    Y = tf.placeholder(
        tf.float32, [None, HEIGHT, WIDTH, NUM_OUTPUTS])  # Truth Data - Output
    global_step = tf.Variable(0,
                              dtype=tf.int32,
                              trainable=False,
                              name='global_step')

    # Define loss and optimizer
    prediction = model.unet(X, NUM_OUTPUTS)
    loss = tf.reduce_mean(tf.square(prediction - Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    trainer = optimizer.minimize(loss, global_step=global_step)

    # Setup Saver
    saver = tf.train.Saver()

    # Initialize variables, and run the network
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    if (doRestore):
        ckpt = tf.train.get_checkpoint_state('./checkpoints/' + modes[modeindex])
        if (ckpt and ckpt.model_checkpoint_path):
            print('Restoring Prev. Model ....')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Model Loaded....')

    print('Start Training: BatchSize:', batch_size, ' LearningRate:',
          learning_rate)

    # Train network
    _step = []
    _loss_train = []
    _loss_test = []

    t0 = time()
    for _ in range(num_steps):
        batch_xs, batch_ys = data.next_batch(batch_size)
        sess.run(trainer, feed_dict={X: batch_xs, Y: batch_ys})

        step = sess.run(global_step)

        if (step % display_step == 0):
            train_loss = sess.run(loss, feed_dict={X: batch_xs, Y: batch_ys})
            test_loss = sess.run(loss,
                                 feed_dict={
                                     X: data.x_test,
                                     Y: data.y_test
                                 })
            print("Step: " + str(step) + " Train Loss: %.4e" % train_loss +
                  " Test Loss: %.4e" % test_loss + " TIME: %g" % (time() - t0))
            _step.append(step)
            _loss_test.append(test_loss)
            _loss_train.append(train_loss)

        if (step % save_step == 0):
            saver.save(sess,
                       './checkpoints/' + modes[modeindex] + '/' +
                       modes[modeindex],
                       global_step=global_step)

    # Show results
    prediction = sess.run(prediction,
                          feed_dict={
                              X: data.x_test,
                              Y: data.y_test
                          })

    plot(data, prediction, modeindex, 0)

    # Plot loss
    plt.plot(_step, np.log10(_loss_train), label='training loss')
    plt.plot(_step, np.log10(_loss_test), label='test loss')
    plt.title('Mean Squared Error (MSE)')
    plt.xlabel('Steps')
    plt.ylabel('log10(MSE)')
    plt.legend()
    plt.show()
Example No. 14
cropped_ground_truth_path_test = '/run/media/henryp/HenryHDD/DataSets/CMU/Formatted/Test/Cropped_data/'

if run_on == 'GPU':
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print('============')
print('Training on: ' + str(device))
print('============')

training_partition, training_set_size = load_data(
    ground_truth_dir=cropped_ground_truth_path_train, type='train')

training_set = DataSet(list_IDS=training_partition['train'],
                       data_dir=cropped_ground_truth_path_train,
                       clip_length=64)
training_generator = data.DataLoader(training_set, **params)

testing_partition, testing_set_size = load_data(
    ground_truth_dir=cropped_ground_truth_path_test, type='test')

testing_set = DataSet(list_IDS=testing_partition['test'],
                      data_dir=cropped_ground_truth_path_test,
                      clip_length=64)

testing_generator = data.DataLoader(testing_set, **params)

# load the network class
# input size is the number of features in your input data
net = CNNLSTMAE(num_frames=64, num_layers=2)
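The DataLoader keyword dictionary params is defined earlier in the original script; a typical shape for it, with assumed values, would be:

# Assumed DataLoader settings; the real script defines `params` before this point.
params = {
    'batch_size': 8,
    'shuffle': True,
    'num_workers': 4,
}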
Example No. 15
    data = pickle.load(open(data_cache_file, "rb"))

    data['name'] = data_name
    #get model
    model = get_model(config['MODEL']['name'], classes, mode='test')
    model.load_state_dict(torch.load(weight_file))
    model.cuda()
    model.eval()

    # define optimization criteria
    weight = torch.from_numpy(data['classWeights'])  # convert the numpy array to torch
    weight = weight.cuda()
    criteria = CrossEntropyLoss2d()  # weight


    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(width, height),
        myTransforms.ToTensor(1),
        #
    ])
    val_data_loader = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['valIm'], data['valAnnot'], transform=valDataset,data_name=data['name']),
        batch_size=1, shuffle=False, num_workers=1, pin_memory=True)

    cudnn.benchmark = True
    start_epoch = 0

    overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(classes, val_data_loader, model, criteria, up, ignore_label)
    print(mIOU_val)
    print(per_class_iu_val)
Example No. 16
def train(order, model, memory):
    """
    Train function
    """
    workers = 0
    if use_cuda:
        model.cuda()
        # Number of workers should be 4*num_gpu_available
        # https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/5
        workers = 4
    # time at the start of training
    start = time.time()

    train_data = DataSet(order, split='train')
    train_sampler = data.SequentialSampler(train_data)
    train_dataloader = data.DataLoader(train_data,
                                       sampler=train_sampler,
                                       batch_size=args.batch_size,
                                       num_workers=workers)
    param_optimizer = list(model.classifier.named_parameters())
    # parameters that need not be decayed
    no_decay = ['bias', 'gamma', 'beta']
    # Grouping the parameters based on whether each parameter undergoes decay or not.
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                   lr=LEARNING_RATE)

    # Store our loss and accuracy for plotting
    train_loss_set = []
    # trange is a tqdm wrapper around the normal python range
    # for epoch in trange(args.epochs, desc="Epoch"):
    for epoch in range(args.epochs):
        # Training begins
        print("Training begins")
        # Set our model to training mode (as opposed to evaluation mode)
        model.classifier.train()
        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps, num_curr_exs = 0, 0, 0
        # Train the data for one epoch
        # for step, batch in enumerate(tqdm(train_dataloader)):
        for step, batch in enumerate(train_dataloader):
            # Release file descriptors which function as shared
            # memory handles otherwise it will hit the limit when
            # there are too many batches at dataloader
            batch_cp = copy.deepcopy(batch)
            del batch
            # Perform sparse experience replay after every REPLAY_FREQ steps
            if (step + 1) % REPLAY_FREQ == 0:
                # sample 32 examples from memory for sparse replay
                content, attn_masks, labels = memory.sample(sample_size=32)
                if use_cuda:
                    content = content.cuda()
                    attn_masks = attn_masks.cuda()
                    labels = labels.cuda()
                # Clear out the gradients (by default they accumulate)
                optimizer.zero_grad()
                # Forward pass
                loss, logits = model.classify(content, attn_masks, labels)
                train_loss_set.append(loss.item())
                # Backward pass
                loss.backward()
                # Update parameters and take a step using the computed gradient
                optimizer.step()

                # Update tracking variables
                tr_loss += loss.item()
                nb_tr_examples += content.size(0)
                nb_tr_steps += 1

                del content
                del attn_masks
                del labels
                del loss
            # Unpacking the batch items
            content, attn_masks, labels = batch_cp
            content = content.squeeze(1)
            attn_masks = attn_masks.squeeze(1)
            labels = labels.squeeze(1)
            # number of examples in the current batch
            num_curr_exs = content.size(0)
            # Place the batch items on the appropriate device: CUDA if available
            if use_cuda:
                content = content.cuda()
                attn_masks = attn_masks.cuda()
                labels = labels.cuda()
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            loss, _ = model.classify(content, attn_masks, labels)
            train_loss_set.append(loss.item())
            # Get the key representation of documents
            keys = model.get_keys(content, attn_masks)
            # Push the examples into the replay memory
            memory.push(keys.cpu().numpy(),
                        (content.cpu().numpy(), attn_masks.cpu().numpy(),
                         labels.cpu().numpy()))
            # delete the batch data to free up GPU memory
            del keys
            del content
            del attn_masks
            del labels
            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += num_curr_exs
            nb_tr_steps += 1

        now = time.time()
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        print("Time taken till now: {} hours".format((now - start) / 3600))
        model_dict = model.save_state()
        save_checkpoint(model_dict, order, epoch + 1, memory=memory.memory)

    save_trainloss(train_loss_set, order)
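The replay memory used above exposes push and sample (plus a get_neighbours k-NN lookup used at test time, omitted here); a compact illustrative sketch of such a key-value store, not the project's actual implementation:

import numpy as np
import torch

class EpisodicMemory:
    """Illustrative replay buffer keyed by document representations."""

    def __init__(self):
        self.keys, self.examples = [], []

    def push(self, keys, batch):
        # Store each key alongside its (content, attn_mask, label) triple.
        contents, attn_masks, labels = batch
        for i, key in enumerate(keys):
            self.keys.append(key)
            self.examples.append((contents[i], attn_masks[i], labels[i]))

    def sample(self, sample_size):
        # Uniformly sample stored examples for sparse experience replay.
        idx = np.random.choice(len(self.examples), size=sample_size, replace=False)
        contents, attn_masks, labels = zip(*(self.examples[i] for i in idx))
        return (torch.as_tensor(np.stack(contents)),
                torch.as_tensor(np.stack(attn_masks)),
                torch.as_tensor(np.stack(labels)))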
Example No. 17
import cv2
import sys
import matplotlib.pyplot as plt
from data_loader import DataSet
from config import batch_size

data = DataSet()
X, Y = data.train_batch(batch_size)

image = cv2.imread(
    "../stage1_train/00071198d059ba7f5914a526d124d28e6d010c92466da21d4a04cd5413362552/images/00071198d059ba7f5914a526d124d28e6d010c92466da21d4a04cd5413362552.png"
)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
ret, thresh1 = cv2.threshold(image, 40, 255,
                             cv2.THRESH_BINARY)  # threshold range 12-44

fig = plt.figure()
a = fig.add_subplot(1, 2, 1)
plt.imshow(thresh1, cmap='gray')
a = fig.add_subplot(1, 2, 2)
plt.imshow(image, cmap='gray')

plt.show()
Example No. 18
    def __init__(self):
        super(WordToVecSearch, self).__init__()
        self._time = 0
        self.__model = self.__load_model()
        self.__data = DataSet()
        self.__corpus_matrix = self.__build_up_matrix()
Example No. 19
    def __init__(self):
        super(ElmoSearch, self).__init__()
        self.__time = 0
        self.__data = DataSet()
        self.__batcher, self.__ids, self.__sent_input = self.__load_model()
        self.__corpus_matrix = self.__build_up_matrix()
Example No. 20
def app(train_or_test):
    if train_or_test == 'train':
        start_time = time.time()
        X_train, y_train = train_data.load_all()
        y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
        print('load training used time:', time.time() - start_time)
        print(X_train.shape)
        print(y_train.shape)
        training(X_train, y_train)

    if train_or_test == 'test':
        X_test, y_test = test_data.load_all()
        testing(X_test, y_test)


if __name__ == '__main__':
    train_data = DataSet(r'/Users/megatron/DL/train_preproc/**/*jpg')
    test_data = DataSet(r'/Users/megatron/DL/test/**/*jpg')
    test_data.use_rotation = False
    test_data.use_filter = False
    num_classes = 100
    train_set, valid_set = train_data.train_valid_split()
    X_train, y_train = train_data.load_all(train_set)
    y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
    X_valid, y_valid = train_data.load_all(valid_set)
    y_valid = keras.utils.to_categorical(y_valid, num_classes=num_classes)
    print(X_valid.shape)
    print(y_valid.shape)
    training(X_train, y_train, X_valid, y_valid)
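The training helper is not included in the snippet; a minimal Keras sketch consistent with how it is called above (the architecture and hyperparameters are assumptions):

import keras
from keras import layers

def training(X_train, y_train, X_valid=None, y_valid=None):
    # Illustrative classifier; the number of classes is read off the one-hot labels.
    model = keras.Sequential([
        layers.Flatten(input_shape=X_train.shape[1:]),
        layers.Dense(256, activation='relu'),
        layers.Dense(y_train.shape[1], activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    validation = (X_valid, y_valid) if X_valid is not None else None
    model.fit(X_train, y_train, epochs=10, batch_size=32,
              validation_data=validation)
    return model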
Example No. 21
def load_data(train_path, test_path):
    """
    load train and test data-sets by files path
    """
    return DataSet(train_path), DataSet(test_path)