def evaluate(model, files, epoch=0, number_of_process=1):
    """Run the model over ``files`` batch by batch and stack the outputs.

    Batches are loaded by ``number_of_process`` background processes and
    handed to this process through a bounded queue. Every queued item
    carries its batch sequence number, and the output of a batch is stored
    at that position, so the concatenation order follows the batch order of
    ``files`` even when workers deliver out of order.

    Args:
        model: Object providing ``get_cnn()``, ``get_batch_size()``,
            ``load_eval_files(list_of_files)`` and ``evaluate(tensor)``.
        files: Sequence of inputs; must support ``len()`` and slicing.
        epoch: Only used to label the progress log lines.
        number_of_process: Number of loader processes to start. With zero
            workers and a non-empty ``files`` nothing is ever queued and
            ``queue.get()`` blocks forever.

    Returns:
        ``np.concatenate`` of the per-batch outputs along axis 0.

    Raises:
        ValueError: From ``np.concatenate`` when ``files`` is empty (there
            is nothing to concatenate).
    """
    cnn = model.get_cnn()
    bs = model.get_batch_size()
    logger = logging.getLogger("trainer")

    queue = torch.multiprocessing.Queue(maxsize=QUEUE_SIZE)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        """Loader process ``i`` of ``n``: handles batches with s % n == i."""

        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            for s, start in enumerate(range(0, len(files), bs)):
                if s % self.n != self.i:
                    continue
                stop = min(start + bs, len(files))
                gc.collect()
                x = model.load_eval_files(files[start:stop])

                # The sequence number travels with the data so the consumer
                # can restore the original batch order.
                queue.put((s, x))
            # Stay alive until the consumer signals completion.
            # NOTE(review): presumably this keeps queued data valid until it
            # has been received - confirm.
            event_done.wait()

    for rank in range(number_of_process):
        Batcher(number_of_process, rank).start()

    use_cuda = torch.cuda.is_available()

    try:
        cnn.eval()
        if use_cuda:
            cnn.cuda()

        all_outputs = [None] * len(range(0, len(files), bs))

        for i in range(0, len(files), bs):
            gc.collect()
            s, x = queue.get()

            x = torch.FloatTensor(x)

            if use_cuda:
                x = x.cuda()

            outputs = model.evaluate(x)

            all_outputs[s] = outputs

            logger.info("Evaluation [%d.%.2d|%d/%d] Memory=%s Queue=%d", epoch,
                        100 * i // len(files), i, len(files),
                        gpu_memory.format_memory(gpu_memory.used_memory()),
                        queue.qsize())

            # Drop the references before blocking on the next queue.get().
            del s
            del x
            del outputs
    finally:
        # Always release workers parked in event_done.wait(), including when
        # the loop above raised. Workers still blocked on a full queue are
        # daemonic and end with the parent process.
        event_done.set()
    return np.concatenate(all_outputs, axis=0)
def train_one_epoch(epoch, model, train_files, optimizer, criterion,
                    number_of_process):
    """Train the model's network for one pass over ``train_files``.

    The files are visited in a freshly shuffled order. Batches are loaded by
    ``number_of_process`` background processes and consumed here through a
    bounded queue; each batch goes through a forward pass, a backward pass
    and one optimizer step.

    Args:
        epoch: Epoch number; passed to ``model.get_batch_size`` and used to
            label the log lines.
        model: Object providing ``get_cnn()``, ``get_batch_size(epoch)`` and
            ``load_train_files(list_of_files)`` returning ``(x, y)``.
        train_files: Sequence of training inputs; must support ``len()`` and
            integer indexing.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        criterion: Callable ``criterion(outputs, y)`` returning the loss.
        number_of_process: Number of loader processes to start. With zero
            workers and a non-empty ``train_files`` nothing is ever queued
            and ``queue.get()`` blocks forever.

    Returns:
        ``np.mean`` of the per-batch loss values (``nan`` when no batch was
        processed).
    """
    cnn = model.get_cnn()
    bs = model.get_batch_size(epoch)
    logger = logging.getLogger("trainer")

    # Shuffle indices rather than the caller's sequence.
    indices = list(range(len(train_files)))
    random.shuffle(indices)

    queue = torch.multiprocessing.Queue(maxsize=QUEUE_SIZE)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        """Loader process ``i`` of ``n``: handles batches with s % n == i."""

        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            for s, start in enumerate(range(0, len(train_files), bs)):
                if s % self.n != self.i:
                    continue
                stop = min(start + bs, len(train_files))
                gc.collect()
                x, y = model.load_train_files(
                    [train_files[g] for g in indices[start:stop]])

                queue.put((x, y))
            # Stay alive until the consumer signals completion.
            # NOTE(review): presumably this keeps queued data valid until it
            # has been received - confirm.
            event_done.wait()

    for rank in range(number_of_process):
        Batcher(number_of_process, rank).start()

    losses = []
    use_cuda = torch.cuda.is_available()

    try:
        cnn.train()
        if use_cuda:
            cnn.cuda()

        for i in range(0, len(train_files), bs):
            t0 = perf_counter()
            gc.collect()

            t = time_logging.start()

            x, y = queue.get()

            x = torch.FloatTensor(x)
            y = torch.FloatTensor(y)

            x = torch.autograd.Variable(x)
            y = torch.autograd.Variable(y)

            if use_cuda:
                x = x.cuda()
                y = y.cuda()

            t = time_logging.end("batch", t)

            optimizer.zero_grad()
            outputs = cnn(x)
            loss = criterion(outputs, y)
            t = time_logging.end("forward", t)
            loss.backward()
            optimizer.step()

            t = time_logging.end("backward", t)

            loss_ = float(loss.data.cpu().numpy())
            losses.append(loss_)

            # The log reports the square root of the loss as "RMSE".
            # NOTE(review): that label presumes a mean-squared-error
            # criterion - confirm against the caller.
            logger.info(
                "[%d.%.2d|%d/%d] RMSE=%.1e <RMSE>=%.1e Queue=%d Memory=%s Time=%.2fs",
                epoch, 100 * i // len(train_files), i, len(train_files),
                loss_**0.5,
                np.mean(losses)**0.5, queue.qsize(),
                gpu_memory.format_memory(gpu_memory.used_memory()),
                perf_counter() - t0)

            # Drop the references before blocking on the next queue.get().
            del x
            del y
            del outputs
            del loss
    finally:
        # Always release workers parked in event_done.wait(), including when
        # the loop above raised. Workers still blocked on a full queue are
        # daemonic and end with the parent process.
        event_done.set()
    return np.mean(losses)
# Example #3
# 0
def train_one_epoch(epoch, model, train_files, train_labels, optimizer,
                    criterion, number_of_process, queue_size):
    """Train the model's network for one epoch and track accuracy.

    ``model.create_train_batches`` decides the batches. They are loaded by
    ``number_of_process`` background processes and consumed here through a
    bounded queue; each batch goes through a forward pass, a backward pass
    and one optimizer step. Every queued item carries its batch index so the
    per-step statistics use the size of the batch that was actually
    received, even when workers deliver out of order.

    Args:
        epoch: Epoch number; passed to ``model.create_train_batches`` and
            used to label the log lines.
        model: Object providing ``get_cnn()``,
            ``create_train_batches(epoch, files, labels)`` and
            ``load_train_batch(batch)`` returning ``(x, y)``.
        train_files: Passed through to ``model.create_train_batches``.
        train_labels: Passed through to ``model.create_train_batches``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        criterion: Callable ``criterion(outputs, y)`` returning the loss.
        number_of_process: Number of loader processes to start. With zero
            workers and at least one batch nothing is ever queued and
            ``queue.get()`` blocks forever.
        queue_size: Maximum number of loaded batches waiting in the queue.

    Returns:
        Tuple ``(mean loss, accuracy)`` where accuracy is
        ``total_correct / total_trained`` over the epoch.

    Raises:
        ZeroDivisionError: When there are no batches, so no sample was
            trained.
    """
    cnn = model.get_cnn()
    logger = logging.getLogger("trainer")

    batches = model.create_train_batches(
        epoch, train_files,
        train_labels)  # list of lists [first batch, second batch, ...]

    queue = torch.multiprocessing.Queue(maxsize=queue_size)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        """Loader process ``i`` of ``n``: handles batches with s % n == i."""

        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            for s, batch in enumerate(batches):
                if s % self.n != self.i:
                    continue
                gc.collect()
                x, y = model.load_train_batch(batch)

                # The batch index travels with the data so the consumer
                # knows which batch it received.
                queue.put((s, x, y))

            # Stay alive until the consumer signals completion.
            # NOTE(review): presumably this keeps queued tensors valid until
            # they have been received - confirm.
            event_done.wait()

    for rank in range(number_of_process):
        Batcher(number_of_process, rank).start()

    losses = []
    total_correct = 0
    total_trained = 0
    use_cuda = torch.cuda.is_available()

    try:
        cnn.train()
        if use_cuda:
            cnn.cuda()

        for s in range(len(batches)):
            t0 = perf_counter()
            gc.collect()

            t = time_logging.start()

            batch_index, x, y = queue.get()
            # Size of the batch actually received, not of the batch at
            # position s.
            batch_size = len(batches[batch_index])

            x = torch.autograd.Variable(x)
            y = torch.autograd.Variable(y)

            t = time_logging.end("load batch", t)

            if use_cuda:
                x = x.cuda()
                y = y.cuda()

            t = time_logging.end("upload batch", t)

            optimizer.zero_grad()
            outputs = cnn(x)
            loss = criterion(outputs, y)
            t = time_logging.end("forward", t)
            loss.backward()
            optimizer.step()

            t = time_logging.end("backward", t)

            loss_ = float(loss.data.cpu().numpy())
            losses.append(loss_)

            predictions = outputs.data.cpu().numpy()
            targets = y.data.cpu().numpy()
            if outputs.size(-1) > 1:
                # Several outputs per sample: the prediction is the argmax.
                if y.dim() == 1:
                    # Targets are compared directly with the argmax index.
                    correct = np.sum(predictions.argmax(-1) == targets)
                else:
                    # Targets are reduced with argmax as well.
                    correct = np.sum(
                        predictions.argmax(-1) == targets.argmax(-1))
            else:
                # Single output per sample: its sign is compared with the
                # target mapped through 2 * y - 1.
                # NOTE(review): assumes y is a flat vector of 0/1 labels -
                # confirm.
                correct = np.sum(
                    np.sign(predictions.reshape((-1, ))) == 2 * targets - 1)
            total_correct += correct
            total_trained += batch_size

            logger.info(
                "[%d.%.2d|%d/%d] Loss=%.1e <Loss>=%.1e Accuracy=%d/%d <Accuracy>=%.2f%% Queue=%d Memory=%s Time=%.2fs",
                epoch, 100 * s // len(batches), s, len(batches), loss_,
                np.mean(losses), correct, batch_size,
                100 * total_correct / total_trained, queue.qsize(),
                gpu_memory.format_memory(gpu_memory.used_memory()),
                perf_counter() - t0)

            # Drop the references before blocking on the next queue.get().
            del x
            del y
            del outputs
            del loss
            del predictions
            del targets
    finally:
        # Always release workers parked in event_done.wait(), including when
        # the loop above raised. Workers still blocked on a full queue are
        # daemonic and end with the parent process.
        event_done.set()
    return (np.mean(losses), total_correct / total_trained)