Python splitBatches Examples

Programming Language: Python

Namespace/Package Name: utils.utilities

Method/Function: splitBatches

Examples at hotexamples.com: 4

Python splitBatches - 4 examples found. These are the top-rated real-world Python examples of utils.utilities.splitBatches, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: solver_general.py Project: hungntt/ChessCommentaryGeneration

    def splitData(self, data, batch_size=-1):
        """Batch every input stream and the target stream of ``data``.

        ``data[0]`` holds the input examples; each example is indexable and
        contributes one sequence per input stream (the number of streams is
        taken from the first example).  ``data[1]`` holds the targets.

        Args:
            data: Two-element container ``(inputs, targets)``.
            batch_size: Batch size to use; ``-1`` (the default) falls back to
                ``self.cnfg.batch_size``.

        Returns:
            A pair ``([input_batches, input_masks], [tgt_batches, tgt_masks])``
            where ``input_batches`` / ``input_masks`` contain one entry per
            input stream, in stream order.
        """
        if batch_size == -1:
            batch_size = self.cnfg.batch_size

        examples = data[0]
        stream_count = len(examples[0])
        targets = data[1]
        config = self.cnfg

        stream_batches = []
        stream_masks = []
        for stream_idx in range(stream_count):
            # Gather the stream_idx-th sequence of every example.
            stream = [example[stream_idx] for example in examples]
            # Inputs are batched with method="pre" (presumably padding placed
            # before the sequence -- confirm in torch_utils.splitBatches).
            batches, masks = torch_utils.splitBatches(train=stream,
                                                      batch_size=batch_size,
                                                      padSymbol=config.garbage,
                                                      method="pre")
            stream_batches.append(batches)
            stream_masks.append(masks)

        # Targets are batched with method="post".
        target_batches, target_masks = torch_utils.splitBatches(
            train=targets,
            batch_size=batch_size,
            padSymbol=config.garbage,
            method="post")

        inputs_out = [stream_batches, stream_masks]
        targets_out = [target_batches, target_masks]
        return inputs_out, targets_out

Example #2

Show file

File: solver_lm.py Project: hungntt/ChessCommentaryGeneration

 def score(self, idList):
     """Return the model's inference-mode loss for one id sequence.

     ``idList`` is wrapped in a single-element list and batched with
     ``batch_size=1`` and ``method="post"``; the first resulting batch and
     mask are passed to ``self.model.forward`` together with
     ``self.loss_function``.

     Returns:
         Element 0 of the loss converted to a NumPy array on the CPU.
     """
     single_example = [idList]
     batches, masks = torch_utils.splitBatches(train=single_example,
                                               batch_size=1,
                                               padSymbol=self.cnfg.garbage,
                                               method="post")
     result = self.model.forward(batches[0],
                                 masks[0],
                                 self.loss_function,
                                 inference=True)
     loss_values = result.data.cpu().numpy()
     return loss_values[0]

Example #3

Show file

    def main(self):
        """Read the corpora, batch them, build the model and run ``cnfg.mode``.

        Behaviour is driven entirely by ``self.cnfg``:

        * ``mode == "trial"``: run one forward/backward/optimizer step on the
          training batch at index 30 and print timings.
        * ``mode == "train"`` or ``"LM"``: for ``cnfg.NUM_EPOCHS`` epochs,
          shuffle and train on the training batches, accumulate the
          validation loss, save a checkpoint named ``modelName_<epoch>`` and
          print the perplexity.
        * ``mode == "inference"``: load the checkpoint ``modelName`` and decode
          the test set.  With ``cnfg.method == "OSOM"`` the four training
          sentences (out of the first 10000) with the smallest
          ``levenshtein.levenshtein`` value are batched per test sentence and
          handed to ``decodeAll`` as ``testIndex``; the process then exits.

        Fixes relative to the earlier revision:

        * the corpus-size messages are really printed (the ``print`` name and
          its arguments had been split into separate no-op statements);
        * every ``zip(...)`` that is sorted, shuffled, indexed or iterated more
          than once is materialised into a list, since ``zip`` returns a
          one-shot iterator on Python 3 (no ``.sort()``, cannot be shuffled,
          and the validation batches were empty from the second epoch on,
          leading to a division by zero in the perplexity computation).
        """
        # Saving object variables as locals for quicker access
        cnfg = self.cnfg
        modelName = cnfg.modelName
        srcLangObj = cnfg.srcLangObj
        tgtLangObj = cnfg.tgtLangObj
        wids_src = srcLangObj.wids
        wids_tgt = tgtLangObj.wids
        # Validation data is only loaded and batched outside inference mode.
        use_valid = cnfg.mode != "inference"

        train_src = srcLangObj.read_corpus("train")
        train_tgt = tgtLangObj.read_corpus("train")

        if use_valid:
            valid_src = srcLangObj.read_corpus(mode="valid")
            valid_tgt = tgtLangObj.read_corpus(mode="valid")

        test_src = srcLangObj.read_corpus(mode="test")
        test_tgt = tgtLangObj.read_corpus(mode="test")

        train_src = train_src[:cnfg.max_train_sentences]
        train_tgt = train_tgt[:cnfg.max_train_sentences]
        print("src vocab size:", len(wids_src))
        print("tgt vocab size:", len(wids_tgt))
        print("training size:", len(train_src))
        if use_valid:
            print("valid size:", len(valid_src))

        # Sort sentence pairs by source length.  sorted() is stable like
        # list.sort() and also works on the iterator zip() returns on Python 3.
        train = sorted(zip(train_src, train_tgt), key=lambda x: len(x[0]))
        train_src = [x[0] for x in train]
        train_tgt = [x[1] for x in train]

        valid = None
        if use_valid:
            valid = sorted(zip(valid_src, valid_tgt), key=lambda x: len(x[0]))
            valid_src = [x[0] for x in valid]
            valid_tgt = [x[1] for x in valid]

        # Sources are batched with method="pre", targets with method="post".
        train_src_batches, train_src_masks = torch_utils.splitBatches(
            train=train_src,
            batch_size=cnfg.batch_size,
            padSymbol=cnfg.garbage,
            method="pre")
        train_tgt_batches, train_tgt_masks = torch_utils.splitBatches(
            train=train_tgt,
            batch_size=cnfg.batch_size,
            padSymbol=cnfg.garbage,
            method="post")
        if use_valid:
            valid_src_batches, valid_src_masks = torch_utils.splitBatches(
                train=valid_src,
                batch_size=cnfg.batch_size,
                padSymbol=cnfg.garbage,
                method="pre")
            valid_tgt_batches, valid_tgt_masks = torch_utils.splitBatches(
                train=valid_tgt,
                batch_size=cnfg.batch_size,
                padSymbol=cnfg.garbage,
                method="post")

        # Test sentences are batched one per batch.
        test_src_batches, test_src_masks = torch_utils.splitBatches(
            train=test_src, batch_size=1, padSymbol=cnfg.garbage, method="pre")
        test_tgt_batches, test_tgt_masks = torch_utils.splitBatches(
            train=test_tgt,
            batch_size=1,
            padSymbol=cnfg.garbage,
            method="post")

        # Dump useless references
        train = None
        valid = None
        # Sanity check
        assert (len(train_tgt_batches) == len(train_src_batches))
        if use_valid:
            assert (len(valid_tgt_batches) == len(valid_src_batches))
        assert (len(test_tgt_batches) == len(test_src_batches))

        print("Training Batches:", len(train_tgt_batches))
        if use_valid:
            print("Validation Batches:", len(valid_tgt_batches))
        print("Test Points:", len(test_src_batches))

        if cnfg.cudnnBenchmark:
            torch.backends.cudnn.benchmark = True
        # Declare model object
        print("Declaring Model, Loss, Optimizer")
        model = SeqToSeqAttn(cnfg, wids_src=wids_src, wids_tgt=wids_tgt)
        # NOTE(review): ``size_average`` is deprecated in recent PyTorch in
        # favour of ``reduction="sum"``; kept as-is for compatibility with the
        # PyTorch version this project targets.
        loss_function = nn.NLLLoss(ignore_index=1, size_average=False)
        if torch.cuda.is_available():
            model.cuda()
            loss_function = loss_function.cuda()
        # Stays None when optimizer_type is neither "SGD" nor "ADAM".
        optimizer = None
        if cnfg.optimizer_type == "SGD":
            optimizer = optim.SGD(model.getParams(), lr=0.05)
        elif cnfg.optimizer_type == "ADAM":
            optimizer = optim.Adam(model.getParams())

        if cnfg.mode == "trial":
            # Uses the hard-coded batch index 30, so at least 31 training
            # batches are required.
            print("Running Sample Batch")
            print("Source Batch Shape:", train_src_batches[30].shape)
            print("Source Mask Shape:", train_src_masks[30].shape)
            print("Target Batch Shape:", train_tgt_batches[30].shape)
            print("Target Mask Shape:", train_tgt_masks[30].shape)
            sample_src_batch = train_src_batches[30]
            sample_tgt_batch = train_tgt_batches[30]
            sample_mask = train_tgt_masks[30]
            sample_src_mask = train_src_masks[30]
            print(datetime.datetime.now())
            model.zero_grad()
            loss = model.forward(sample_src_batch, sample_tgt_batch,
                                 sample_src_mask, sample_mask, loss_function)
            print(loss)
            loss.backward()
            optimizer.step()
            print(datetime.datetime.now())
            print("Done Running Sample Batch")

        # Materialise as lists: the training batches are shuffled in place and
        # both collections are iterated once per epoch.
        train_batches = list(
            zip(train_src_batches, train_tgt_batches, train_src_masks,
                train_tgt_masks))
        if use_valid:
            valid_batches = list(
                zip(valid_src_batches, valid_tgt_batches, valid_src_masks,
                    valid_tgt_masks))

        # Drop the per-stream references; the zipped lists hold the data now.
        train_src_batches, train_tgt_batches, train_src_masks, train_tgt_masks = None, None, None, None
        if use_valid:
            valid_src_batches, valid_tgt_batches, valid_src_masks, valid_tgt_masks = None, None, None, None

        if cnfg.mode == "train" or cnfg.mode == "LM":
            print("Start Time:", datetime.datetime.now())
            for epochId in range(cnfg.NUM_EPOCHS):
                random.shuffle(train_batches)
                for batchId, batch in enumerate(train_batches):
                    src_batch, tgt_batch, src_mask, tgt_mask = batch
                    batchLength = src_batch.shape[1]
                    batchSize = src_batch.shape[0]
                    tgtBatchLength = tgt_batch.shape[1]
                    # Batches that are too long or contain a single example
                    # are skipped.
                    if batchLength < cnfg.MAX_SEQ_LEN and batchSize > 1 and tgtBatchLength < cnfg.MAX_TGT_SEQ_LEN:
                        model.zero_grad()
                        loss = model.forward(src_batch, tgt_batch, src_mask,
                                             tgt_mask, loss_function)
                        if cnfg.mem_optimize:
                            del src_batch, tgt_batch, src_mask, tgt_mask
                        loss.backward()
                        if cnfg.mem_optimize:
                            del loss
                        optimizer.step()
                    if batchId % cnfg.PRINT_STEP == 0:
                        print("Batch No:", batchId, " Time:",
                              datetime.datetime.now())

                totalValidationLoss = 0.0
                NUM_TOKENS = 0.0
                for batchId, batch in enumerate(valid_batches):
                    src_batch, tgt_batch, src_mask, tgt_mask = batch
                    model.zero_grad()
                    loss = model.forward(src_batch,
                                         tgt_batch,
                                         src_mask,
                                         tgt_mask,
                                         loss_function,
                                         inference=True)
                    if cnfg.normalizeLoss:
                        # Presumably the model returns a per-token loss in
                        # this case, so it is scaled back by the mask sum --
                        # TODO confirm against the model implementation.
                        totalValidationLoss += (
                            loss.data.cpu().numpy()) * np.sum(tgt_mask)
                    else:
                        totalValidationLoss += (loss.data.cpu().numpy())
                    NUM_TOKENS += np.sum(tgt_mask)
                    if cnfg.mem_optimize:
                        del src_batch, tgt_batch, src_mask, tgt_mask, loss

                model.save_checkpoint(modelName + "_" + str(epochId),
                                      optimizer)

                perplexity = math.exp(totalValidationLoss / NUM_TOKENS)
                print("Epoch:", epochId, " Total Validation Loss:",
                      totalValidationLoss, " Perplexity:", perplexity)
            print("End Time:", datetime.datetime.now())

        elif cnfg.mode == "inference":
            if cnfg.method == "OSOM":
                import levenshtein as levenshtein
                train_src = train_src[:10000]  # [:500]
                train_tgt = train_tgt[:10000]  # [:500]
                trainIndex = {}
                for i in range(len(train_src)):
                    trainIndex[i] = (train_src[i], train_tgt[i])
                testIndex = {}
                for i, test_sentence in enumerate(test_src):
                    if i % 300 == 0:
                        print("Computed Similarity Upto:", i)
                    simValues = [(j,
                                  levenshtein.levenshtein(
                                      trainIndex[j][0], test_sentence))
                                 for j in trainIndex]
                    simValues.sort(key=lambda x: x[1])
                    # Keep the ids of the four lowest-valued training sentences.
                    nearest = [x[0] for x in simValues[:4]]
                    if len(nearest) % 2 == 1:
                        # If odd, make it even by giving double importance to most similar sentence.
                        nearest.append(nearest[0])
                    near_src = [trainIndex[x][0] for x in nearest]
                    near_tgt = [trainIndex[x][1] for x in nearest]

                    # All selected neighbours go into a single batch.
                    near_src_batches, near_src_masks = torch_utils.splitBatches(
                        train=near_src,
                        batch_size=len(near_src),
                        padSymbol=cnfg.garbage,
                        method="pre")
                    near_tgt_batches, near_tgt_masks = torch_utils.splitBatches(
                        train=near_tgt,
                        batch_size=len(near_src),
                        padSymbol=cnfg.garbage,
                        method="post")
                    # A list (not a one-shot zip iterator) so the consumer can
                    # index it or iterate it repeatedly.
                    testIndex[i] = list(
                        zip(near_src_batches, near_tgt_batches,
                            near_src_masks, near_tgt_masks))
                print("Done loading similarity matrix")
                model.load_from_checkpoint(modelName)
                model.decodeAll(test_src_batches,
                                modelName,
                                method=cnfg.method,
                                evalMethod="BLEU",
                                suffix="test",
                                testIndex=testIndex,
                                loss_function=loss_function,
                                optimizer=optimizer)
                # NOTE(review): exit() is the helper injected by the ``site``
                # module; consider sys.exit() if this runs without ``site``.
                exit()
            model.load_from_checkpoint(modelName)
            # Evaluate on test first
            model.decodeAll(test_src_batches,
                            modelName,
                            method=cnfg.method,
                            evalMethod="BLEU",
                            suffix="test",
                            lmObj=self.cnfg.lmObj)

Example #4

Show file

File: solver_lm.py Project: hungntt/ChessCommentaryGeneration

    def main(self):
        """Read the corpora, batch them, build the language model and run ``cnfg.mode``.

        Behaviour is driven entirely by ``self.cnfg``:

        * ``mode == "trial"``: run one forward/backward/optimizer step on the
          training batch at index 30 and print timings.
        * ``mode == "train"`` or ``"LM"``: for ``cnfg.NUM_EPOCHS`` epochs,
          shuffle and train on the target side of the training batches,
          accumulate the validation loss, save a checkpoint named
          ``modelName_<epoch>`` and print the perplexity.
        * ``mode == "inference"``: load the checkpoint ``modelName`` and decode
          the test set and then the validation set.
        * ``mode == "saveLM"``: store the model on ``self.model``, load the
          checkpoint, print the loss of the first test target batch and the
          result of ``self.score`` on the first test target, and return
          ``self``.

        Side effects: sets ``self.loss_function`` (always) and ``self.model``
        (``saveLM`` mode only).

        Returns:
            ``self`` in ``saveLM`` mode, otherwise ``None``.

        Fixes relative to the earlier revision:

        * every ``zip(...)`` that is sorted, shuffled or iterated more than
          once is materialised into a list, since ``zip`` returns a one-shot
          iterator on Python 3 (no ``.sort()``, cannot be shuffled, and the
          validation batches were empty from the second epoch on, leading to
          a division by zero in the perplexity computation);
        * the "Checking out LM" message in ``saveLM`` mode is really printed
          (it was a bare string statement).
        """
        # Saving object variables as locals for quicker access
        cnfg = self.cnfg
        modelName = cnfg.modelName
        srcLangObj = cnfg.srcLangObj
        tgtLangObj = cnfg.tgtLangObj
        wids_src = srcLangObj.wids
        wids_tgt = tgtLangObj.wids
        # Validation data is only loaded and batched outside inference mode.
        use_valid = cnfg.mode != "inference"

        train_src = srcLangObj.read_corpus("train")
        train_tgt = tgtLangObj.read_corpus("train")

        if use_valid:
            valid_src = srcLangObj.read_corpus(mode="valid")
            valid_tgt = tgtLangObj.read_corpus(mode="valid")

        test_src = srcLangObj.read_corpus(mode="test")
        test_tgt = tgtLangObj.read_corpus(mode="test")

        train_src = train_src[:cnfg.max_train_sentences]
        train_tgt = train_tgt[:cnfg.max_train_sentences]
        print("src vocab size:", len(wids_src))
        print("tgt vocab size:", len(wids_tgt))
        print("training size:", len(train_src))
        if use_valid:
            print("valid size:", len(valid_src))

        # Sort sentence pairs by source length.  sorted() is stable like
        # list.sort() and also works on the iterator zip() returns on Python 3.
        train = sorted(zip(train_src, train_tgt), key=lambda x: len(x[0]))
        train_src = [x[0] for x in train]
        train_tgt = [x[1] for x in train]

        valid = None
        if use_valid:
            valid = sorted(zip(valid_src, valid_tgt), key=lambda x: len(x[0]))
            valid_src = [x[0] for x in valid]
            valid_tgt = [x[1] for x in valid]

        # Sources are batched with method="pre", targets with method="post".
        train_src_batches, train_src_masks = torch_utils.splitBatches(
            train=train_src,
            batch_size=cnfg.batch_size,
            padSymbol=cnfg.garbage,
            method="pre")
        train_tgt_batches, train_tgt_masks = torch_utils.splitBatches(
            train=train_tgt,
            batch_size=cnfg.batch_size,
            padSymbol=cnfg.garbage,
            method="post")
        if use_valid:
            valid_src_batches, valid_src_masks = torch_utils.splitBatches(
                train=valid_src,
                batch_size=cnfg.batch_size,
                padSymbol=cnfg.garbage,
                method="pre")
            valid_tgt_batches, valid_tgt_masks = torch_utils.splitBatches(
                train=valid_tgt,
                batch_size=cnfg.batch_size,
                padSymbol=cnfg.garbage,
                method="post")

        # Test sentences are batched one per batch.
        test_src_batches, test_src_masks = torch_utils.splitBatches(
            train=test_src, batch_size=1, padSymbol=cnfg.garbage, method="pre")
        test_tgt_batches, test_tgt_masks = torch_utils.splitBatches(
            train=test_tgt,
            batch_size=1,
            padSymbol=cnfg.garbage,
            method="post")

        # Dump useless references
        train = None
        valid = None
        # Sanity check
        assert (len(train_tgt_batches) == len(train_src_batches))
        if use_valid:
            assert (len(valid_tgt_batches) == len(valid_src_batches))
        assert (len(test_tgt_batches) == len(test_src_batches))

        print("Training Batches:", len(train_tgt_batches))
        if use_valid:
            print("Validation Batches:", len(valid_tgt_batches))
        print("Test Points:", len(test_src_batches))

        if cnfg.cudnnBenchmark:
            torch.backends.cudnn.benchmark = True
        # Declare model object
        print("Declaring Model, Loss, Optimizer")
        model = LM(cnfg, wids_src=wids_src, wids_tgt=wids_tgt)
        # NOTE(review): ``size_average`` is deprecated in recent PyTorch in
        # favour of ``reduction="sum"``; kept as-is for compatibility with the
        # PyTorch version this project targets.
        loss_function = nn.NLLLoss(ignore_index=1, size_average=False)
        # Kept on the instance because score() reads self.loss_function.
        self.loss_function = loss_function
        if torch.cuda.is_available():
            model.cuda()
            loss_function = loss_function.cuda()
        # Stays None when optimizer_type is neither "SGD" nor "ADAM".
        optimizer = None
        if cnfg.optimizer_type == "SGD":
            optimizer = optim.SGD(model.getParams(), lr=0.05)
        elif cnfg.optimizer_type == "ADAM":
            optimizer = optim.Adam(model.getParams())

        if cnfg.mode == "trial":
            # Uses the hard-coded batch index 30, so at least 31 training
            # batches are required.
            print("Running Sample Batch")
            print("Source Batch Shape:", train_src_batches[30].shape)
            print("Source Mask Shape:", train_src_masks[30].shape)
            print("Target Batch Shape:", train_tgt_batches[30].shape)
            print("Target Mask Shape:", train_tgt_masks[30].shape)
            sample_src_batch = train_src_batches[30]
            sample_tgt_batch = train_tgt_batches[30]
            sample_mask = train_tgt_masks[30]
            sample_src_mask = train_src_masks[30]
            print(datetime.datetime.now())
            model.zero_grad()
            # NOTE(review): every other call in this method passes
            # (tgt_batch, tgt_mask, loss_function); this five-argument form
            # looks copied from the seq2seq solver -- confirm against
            # LM.forward before relying on trial mode.
            loss = model.forward(sample_src_batch, sample_tgt_batch,
                                 sample_src_mask, sample_mask, loss_function)
            print(loss)
            loss.backward()
            optimizer.step()
            print(datetime.datetime.now())
            print("Done Running Sample Batch")

        # Materialise as lists: the training batches are shuffled in place and
        # both collections are iterated once per epoch.
        train_batches = list(
            zip(train_src_batches, train_tgt_batches, train_src_masks,
                train_tgt_masks))
        if use_valid:
            valid_batches = list(
                zip(valid_src_batches, valid_tgt_batches, valid_src_masks,
                    valid_tgt_masks))

        # Drop the per-stream references; the zipped lists hold the data now.
        train_src_batches, train_tgt_batches, train_src_masks, train_tgt_masks = None, None, None, None
        if use_valid:
            valid_src_batches, valid_tgt_batches, valid_src_masks, valid_tgt_masks = None, None, None, None

        if cnfg.mode == "train" or cnfg.mode == "LM":
            print("Start Time:", datetime.datetime.now())
            for epochId in range(cnfg.NUM_EPOCHS):
                random.shuffle(train_batches)
                for batchId, batch in enumerate(train_batches):
                    src_batch, tgt_batch, src_mask, tgt_mask = batch
                    batchLength = src_batch.shape[1]
                    batchSize = src_batch.shape[0]
                    tgtBatchLength = tgt_batch.shape[1]
                    # Batches that are too long or contain a single example
                    # are skipped.
                    if batchLength < cnfg.MAX_SEQ_LEN and batchSize > 1 and tgtBatchLength < cnfg.MAX_TGT_SEQ_LEN:
                        model.zero_grad()
                        # The language model only sees the target side.
                        loss = model.forward(tgt_batch, tgt_mask,
                                             loss_function)
                        if cnfg.mem_optimize:
                            del src_batch, tgt_batch, src_mask, tgt_mask
                        loss.backward()
                        if cnfg.mem_optimize:
                            del loss
                        optimizer.step()
                    if batchId % cnfg.PRINT_STEP == 0:
                        print("Batch No:", batchId, " Time:",
                              datetime.datetime.now())

                totalValidationLoss = 0.0
                NUM_TOKENS = 0.0
                for batchId, batch in enumerate(valid_batches):
                    src_batch, tgt_batch, src_mask, tgt_mask = batch
                    model.zero_grad()
                    loss = model.forward(tgt_batch,
                                         tgt_mask,
                                         loss_function,
                                         inference=True)
                    if cnfg.normalizeLoss:
                        # Presumably the model returns a per-token loss in
                        # this case, so it is scaled back by the mask sum --
                        # TODO confirm against the model implementation.
                        totalValidationLoss += (
                            loss.data.cpu().numpy()) * np.sum(tgt_mask)
                    else:
                        totalValidationLoss += (loss.data.cpu().numpy())
                    NUM_TOKENS += np.sum(tgt_mask)
                    if cnfg.mem_optimize:
                        del src_batch, tgt_batch, src_mask, tgt_mask, loss

                model.save_checkpoint(modelName + "_" + str(epochId),
                                      optimizer)

                perplexity = math.exp(totalValidationLoss / NUM_TOKENS)
                print("Epoch:", epochId, " Total Validation Loss:",
                      totalValidationLoss, " Perplexity:", perplexity)
            print("End Time:", datetime.datetime.now())

        elif cnfg.mode == "inference":
            model.load_from_checkpoint(modelName)
            # Evaluate on test first
            model.decodeAll(test_src_batches,
                            modelName,
                            method=cnfg.method,
                            evalMethod="BLEU",
                            suffix="test")
            # Also on valid
            valid_src = srcLangObj.read_corpus(mode="valid")
            valid_src_batches, valid_src_masks = torch_utils.splitBatches(
                train=valid_src,
                batch_size=1,
                padSymbol=cnfg.garbage,
                method="pre")
            model.decodeAll(valid_src_batches,
                            modelName,
                            method=cnfg.method,
                            evalMethod="BLEU",
                            suffix="valid")

        elif cnfg.mode == "saveLM":
            print("Checking out LM")
            # score() reads self.model, so it must be set before the call.
            self.model = model
            model.load_from_checkpoint(modelName)
            loss = model.forward(test_tgt_batches[0],
                                 test_tgt_masks[0],
                                 loss_function,
                                 inference=True)
            print(loss.data.cpu().numpy())
            print(self.score(test_tgt[0]))
            print("Checked out LM")
            return self