Beispiel #1
0
 def setUp(self):
     """Create a fresh Loader plus the three small tables the tests run on.

     Every table is a list of rows; the first row holds the column names and
     the remaining rows hold string values ("" marks a missing value).
     """
     columns = ["Age", "Income", "class"]

     self.dataLoader = Loader()

     # Four data rows.
     self.dataWithEvenInstances = [
         list(columns),
         ["13", "1000", "yes"],
         ["18", "5000", "no"],
         ["15", "3000", "no"],
         ["14", "800", "yes"],
     ]

     # Three data rows.
     self.dataWithOddInstances = [
         list(columns),
         ["13", "1000", "yes"],
         ["18", "5000", "no"],
         ["15", "3000", "no"],
     ]

     # One empty cell in each data row.
     self.dataWithMissingValues = [
         list(columns),
         ["13", "", "yes"],
         ["18", "5000", ""],
         ["", "3000", "no"],
     ]
Beispiel #2
0
def Test(opt):
    """Evaluate the model built from ``opt`` on the 'test' split.

    Builds the loader and model, runs ``eval_utils.evaluate`` and passes the
    resulting predictions and targets to ``overlapping_test`` and
    ``multiple_test``.

    Args:
        opt: options object; this block reads ``input_label2id``,
            ``eval_batch_size``, ``rel_num`` and ``input_rel2id`` from it.
    """
    loader = Loader(opt)
    Model = model.setup(opt).cuda()
    label2id = load_label(opt.input_label2id)
    predictions, targets, attention_score, metrics = eval_utils.evaluate(
        Model, loader, label2id, opt.eval_batch_size, opt.rel_num, 'test')
    # Use a context manager so the mapping file is closed right after parsing
    # instead of leaking the handle until garbage collection.
    with open(opt.input_rel2id, 'r') as rel2id_file:
        rel2id = json.load(rel2id_file)
    # Inverse mapping: relation id -> relation name.
    id2rel = {v: k for k, v in rel2id.items()}

    overlapping_test(predictions, targets)
    multiple_test(predictions, targets)
    '''
Beispiel #3
0
    def startProcess(self, labelWidget):
        """
        Run the whole pipeline: load, clean, discretize, classify and report.

        After every stage a progress line is appended to the text of
        ``labelWidget`` and the intermediate data sets are written to
        ``self.savingFolderPath``. Must be called after all setters have been
        activated.

        Attributes:
            labelWidget(tkinter.Label) : a message box for showing process to user

        Returns:
            The result of the final ``labelWidget.configure`` call (either the
            success message or one of the two error messages).
        """
        def report(message):
            # Append instead of replace so earlier progress lines stay visible.
            return labelWidget.configure(text=labelWidget.cget("text") +
                                         message)

        fileCreator = CreateFile()
        dataCleaner = Cleaner()
        dataDiscretization = Discretization()
        Calculator = MiningCalculator()
        dataClassifier = Classifier()
        dataLoader = Loader()
        splitFunction = Calculator.getSplitFunc(self.classifierSplitType)

        try:
            report("Building process starting\n")

            dataLoader.loadData(self.folderPath)
            report("Data loading Finished\n")

            dataCleaner.cleanTrainingSet(dataLoader.trainingSet,
                                         dataLoader.structure)
            dataCleaner.cleanTestSet(dataLoader.testSet, dataLoader.structure)
            fileCreator.createCsvFile(dataLoader.structure,
                                      dataLoader.trainingSet,
                                      "Clean Training set",
                                      self.savingFolderPath)
            fileCreator.createCsvFile(dataLoader.structure, dataLoader.testSet,
                                      "Clean Test set", self.savingFolderPath)
            report("Data cleaning Finished\n")

            dataDiscretization.discretizationData(dataLoader.trainingSet,
                                                  dataLoader.testSet,
                                                  dataLoader.structure,
                                                  self.discretizationBins,
                                                  self.discretizationType)
            fileCreator.createCsvFile(dataLoader.structure,
                                      dataLoader.trainingSet,
                                      "Discretization Training set",
                                      self.savingFolderPath)
            fileCreator.createCsvFile(dataLoader.structure, dataLoader.testSet,
                                      "Discretization Test set",
                                      self.savingFolderPath)
            report("Data Discretization Finished\n")

            classifier = dataClassifier.buildClassifier(
                dataLoader.trainingSet, dataLoader.structure,
                self.classifierType, splitFunction)
            classifiedTestData = dataClassifier.classifyTest(
                dataLoader.testSet, dataLoader.structure, classifier)
            fileCreator.createCsvFile(dataLoader.structure, classifiedTestData,
                                      "Classified Test set",
                                      self.savingFolderPath)
            accuracy = dataClassifier.checkAccuracyOfClassifier(
                classifiedTestData, dataLoader.testSet)
            # The accuracy is stored alongside the rules in the "Rules" file.
            classifier += ["accuracy: " + str(accuracy)]
            fileCreator.createTxtFile(classifier, "Rules",
                                      self.savingFolderPath)
            report("Building classifier Finished\n")

            return report("Classifier build successfully with accuracy: " +
                          str(round(accuracy, 3)) + "\n")

        except EnvironmentError:
            return report(
                "Problem with file\\ file path. please check file is not empty and file path is correct!"
            )
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are not silently swallowed.
            return report(
                "An Error occurred please check file and inputs and start again!"
            )
Beispiel #4
0
def train(opt):
    """Train the model described by ``opt``, evaluating on 'dev' between epochs.

    State (``infos``, ``histories``, optimizer) is restored from
    ``opt.start_from`` when that option is set; otherwise training starts from
    scratch. At the start of every epoch except epoch 0 the model is evaluated
    on the 'dev' split and a checkpoint is saved, with an extra 'best'
    checkpoint whenever the F1 score improves. The loop stops once
    ``opt.max_epoch`` epochs are done (never, when ``opt.max_epoch`` is -1).

    On ``RuntimeError`` or ``KeyboardInterrupt`` a checkpoint is saved and the
    traceback is printed; the exception is not re-raised.

    Args:
        opt: options object carrying the hyper-parameters and paths read below.
    """
    loader = Loader(opt)
    infos = {}
    histories = {}

    Model = model.setup(opt).cuda()
    LW_model = LossWrapper(Model, opt)
    # DP_lw_model = torch.nn.DataParallel(LW_model)
    LW_model.train()
    optimizer = utils.build_optimizer(Model.parameters(), opt)

    if opt.start_from is not None:
        # NOTE(review): these files are unpickled; only resume from trusted
        # checkpoint directories.
        with open(os.path.join(opt.start_from, 'infos-best.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)

        if os.path.isfile(os.path.join(opt.start_from, 'histories-best.pkl')):
            with open(os.path.join(opt.start_from, 'histories-best.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)

        if os.path.isfile(os.path.join(opt.start_from, 'optimizer-best.pth')):
            optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer-best.pth')))
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['opt'] = opt
        infos['label2id'] = load_label(opt.input_label2id)

    # Integer defaults: the counters are compared and incremented as numbers
    # below, so a string default would break `epoch != 0` / `epoch > ...`.
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    best_val_score = infos.get('best_val_score', 0)

    # Older checkpoints stored the validation history under the key 'c';
    # fall back to it so that history is still picked up when resuming.
    val_result_history = histories.get('val_result_history',
                                       histories.get('c', {}))
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    epoch_done = True
    best_epoch = -1
    try:
        while True:
            if epoch_done:
                iteration = 0
                if epoch != 0:
                    predictions, targets, _ ,metrics = eval_utils.evaluate(Model, loader, infos['label2id'], opt.eval_batch_size, opt.rel_num, 'dev')
                    # NOTE(review): `iteration` was just reset to 0, so every
                    # evaluation overwrites entry 0 - confirm whether this
                    # should be keyed by epoch instead.
                    val_result_history[iteration] = {'predictions': predictions, 'metrics': metrics, 'targets': targets}
                    #print('dev res: ', metrics)
                    current_score = metrics['F1']
                    # Store under the same key that is read back on resume.
                    histories['val_result_history'] = val_result_history
                    histories['loss_history'] = loss_history
                    histories['lr_history'] = lr_history

                    best_flag = False
                    if current_score > best_val_score:
                        best_epoch = epoch
                        best_val_score = current_score
                        best_flag = True
                    infos['best_val_score'] = best_val_score

                    save_checkpoint(Model, infos, optimizer, histories)
                    if best_flag:
                        save_checkpoint(Model, infos, optimizer, append='best')


                epoch_done = False
                # Step-wise decay: the rate is multiplied by
                # `learning_rate_decay` once per `learning_rate_decay_every`
                # epochs after `learning_rate_decay_start`.
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay ** frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)
            start = time.time()
            data = loader.get_batch_train(opt.batch_size)
            #data = sorted(data, key=lambda x: x[-1], reverse=True)
            # The last element of the batch is a flag that is truthy when the
            # loader has wrapped around, i.e. the epoch is finished.
            wrapped = data[-1]
            data = data[:-1]
            #print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()
            data = [t.cuda() for t in data]
            sents, rels, labels, poses, chars, sen_lens = data
            if not opt.use_char:
                chars = None
            if not opt.use_pos:
                poses = None
            # mask[i, j] is 1 for the first sen_lens[i] positions of row i.
            mask = torch.zeros(sents.size()).cuda()
            for i in range(sents.size(0)):
                mask[i][:sen_lens[i]] = 1

            # Per-position weights: 1 where the label equals 8, 10 elsewhere,
            # zeroed outside the sentence by `mask`.
            mask2 = torch.where(labels == 8, torch.ones_like(sents), torch.ones_like(sents)*10).cuda()
            mask2 = mask2.float() * mask.float()

            optimizer.zero_grad()
            sum_loss = LW_model(sents, sen_lens, rels, mask, labels, mask2, poses, chars)

            # Average the summed loss over the batch dimension.
            loss = sum_loss/sents.shape[0]
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            if iteration % 200 == 0:
                end = time.time()
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))

            iteration += 1
            if wrapped:
                epoch += 1
                epoch_done = True
            infos['iter'] = iteration
            infos['epoch'] = epoch

            if iteration % opt.save_loss_every == 0:
                loss_history[iteration] = train_loss
                lr_history[iteration] = opt.current_lr
            if opt.max_epoch != -1 and epoch >= opt.max_epoch:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort rescue: persist the current state before reporting.
        print('Save ckpt on exception ...')
        save_checkpoint(Model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
Beispiel #5
0
class TestDataLoader(unittest.TestCase):
    """Unit tests for ``Loader`` driven by three small in-memory tables.

    Each table is a list of rows whose first row holds the column names; the
    tables differ in the number of data rows and in whether cells are empty.
    """

    dataLoader = None
    dataWithEvenInstances = []
    dataWithOddInstances = []
    dataWithMissingValues = []

    def setUp(self):
        """Create a fresh Loader and fresh fixture tables before every test."""
        self.dataLoader = Loader()
        self.dataWithEvenInstances = [
            ["Age", "Income", "class"],
            ["13", "1000", "yes"],
            ["18", "5000", "no"],
            ["15", "3000", "no"],
            ["14", "800", "yes"],
        ]
        self.dataWithOddInstances = [
            ["Age", "Income", "class"],
            ["13", "1000", "yes"],
            ["18", "5000", "no"],
            ["15", "3000", "no"],
        ]
        self.dataWithMissingValues = [
            ["Age", "Income", "class"],
            ["13", "", "yes"],
            ["18", "5000", ""],
            ["", "3000", "no"],
        ]

    # ------------------------------------------------------------------
    # Shared expectations and check helpers (names do not start with
    # "test", so the runner does not collect them).
    # ------------------------------------------------------------------

    @staticmethod
    def _expectedStructure():
        """Structure expected from buildStructure for every fixture table."""
        return {
            "Age": {"index": 0, "values": ["Numeric"]},
            "Income": {"index": 1, "values": ["Numeric"]},
            "class": {"index": 2, "values": ["yes", "no"]},
        }

    @staticmethod
    def _expectedColumns():
        """Column map expected from getColumnsName for every fixture table."""
        return {
            "Age": {"index": 0},
            "Income": {"index": 1},
            "class": {"index": 2},
        }

    @staticmethod
    def _classStructure():
        """Structure argument handed to buildDataSets in the split tests."""
        return {'class': {'index': 2, 'values': ['no', 'yes']}}

    def _checkStructure(self, table):
        self.dataLoader.buildStructure(table)
        self.assertEqual(self._expectedStructure(), self.dataLoader.structure)

    def _checkColumnsName(self, table):
        self.assertEqual(self._expectedColumns(),
                         self.dataLoader.getColumnsName(table))

    def _checkColumnValues(self, table):
        expectedPerColumn = (
            (0, ["Numeric"]),
            (1, ["Numeric"]),
            (2, ["yes", "no"]),
        )
        for column, expected in expectedPerColumn:
            self.assertEqual(
                expected, self.dataLoader.getColumnValues(column, table, 2))

    def _checkIsNumeric(self, table):
        for column, expected in ((0, True), (1, True), (2, False)):
            self.assertEqual(expected,
                             self.dataLoader.isNumeric(column, table))

    def _buildSets(self, table):
        # The header row is dropped before the rows are handed over.
        self.dataLoader.buildDataSets(table[1:], self._classStructure())

    # ------------------------------------------------------------------
    # loadData
    # ------------------------------------------------------------------

    def test_loadData(self):
        # need to read from file
        pass

    # ------------------------------------------------------------------
    # buildStructure
    # ------------------------------------------------------------------

    def test_buildStructure_dataWithEvenInstances(self):
        self._checkStructure(self.dataWithEvenInstances)

    def test_buildStructure_dataWithOddInstances(self):
        self._checkStructure(self.dataWithOddInstances)

    def test_buildStructure_dataWithMissingValues(self):
        self._checkStructure(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # getColumnsName
    # ------------------------------------------------------------------

    def test_getColumnsName_dataWithEvenInstances(self):
        self._checkColumnsName(self.dataWithEvenInstances)

    def test_getColumnsName_dataWithOddInstances(self):
        self._checkColumnsName(self.dataWithOddInstances)

    def test_getColumnsName_dataWithMissingValues(self):
        self._checkColumnsName(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # getColumnValues
    # ------------------------------------------------------------------

    def test_getColumnValues_dataWithEvenInstances(self):
        self._checkColumnValues(self.dataWithEvenInstances)

    def test_getColumnValues_dataWithOddInstances(self):
        self._checkColumnValues(self.dataWithOddInstances)

    def test_getColumnValues_dataWithMissingValues(self):
        self._checkColumnValues(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # isNumeric
    # ------------------------------------------------------------------

    def test_isNumeric_dataWithEvenInstances(self):
        self._checkIsNumeric(self.dataWithEvenInstances)

    def test_isNumeric_dataWithOddInstances(self):
        self._checkIsNumeric(self.dataWithOddInstances)

    def test_isNumeric_dataWithMissingValues(self):
        self._checkIsNumeric(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # buildDataSets
    # ------------------------------------------------------------------

    def test_buildTrainingSet_dataWithEvenInstances(self):
        self._buildSets(self.dataWithEvenInstances)

        expectedTraining = [["18", "5000", "no"], ["13", "1000", "yes"]]
        expectedTest = [["15", "3000", "no"], ["14", "800", "yes"]]
        self.assertEqual(expectedTraining, self.dataLoader.trainingSet)
        self.assertEqual(expectedTest, self.dataLoader.testSet)

    def test_buildTrainingSet_dataWithOddInstances(self):
        self._buildSets(self.dataWithOddInstances)

        expectedTraining = [["18", "5000", "no"], ["13", "1000", "yes"]]
        expectedTest = [["15", "3000", "no"]]
        self.assertEqual(expectedTraining, self.dataLoader.trainingSet)
        self.assertEqual(expectedTest, self.dataLoader.testSet)

    def test_buildTrainingSet_dataWithMissingValues(self):
        self._buildSets(self.dataWithMissingValues)

        expectedTraining = [
            ["", "3000", "no"],
            ["13", "", "yes"],
            ["18", "5000", ""],
        ]
        self.assertEqual(expectedTraining, self.dataLoader.trainingSet)
        self.assertEqual([], self.dataLoader.testSet)
Beispiel #6
0
                    true_pos += 1
                elif p_ == 'O':
                    false_neg += 1
            elif p_ != y_:
                false_pos += 1
    prec = true_pos / (true_pos +
                       false_pos) if true_pos + false_pos != 0 else 0
    recall = true_pos / (true_pos +
                         false_neg) if true_pos + false_neg != 0 else 0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall != 0 else 0
    return f1, prec, recall


if __name__ == '__main__':
    # Script entry point: load the three data splits, print their sizes and
    # construct the BiLSTM_CRF model.
    print("Load Data from file")
    loader = Loader()
    # load_data returns six values per split; the size printed for each split
    # is the first dimension of its word array.
    train_len, train_word, train_tag, train_char, train_orth, train_label = loader.load_data(
        'train')
    print("Train: ", train_word.shape[0])
    dev_len, dev_word, dev_tag, dev_char, dev_orth, dev_label = loader.load_data(
        'dev')
    print("Dev: ", dev_word.shape[0])
    test_len, test_word, test_tag, test_char, test_orth, test_label = loader.load_data(
        'test')
    print("Test: ", test_word.shape[0])

    # Model hyper-parameters passed to BiLSTM_CRF below.
    EMBEDDING_DIM = 200
    HIDDEN_DIM = 100

    # The model receives the size of the loader's word_to_id mapping and the
    # loader's label_to_id mapping (presumably vocabulary size and tag set -
    # confirm against BiLSTM_CRF's signature).
    model = BiLSTM_CRF(len(loader.word_to_id), loader.label_to_id,
                       EMBEDDING_DIM, HIDDEN_DIM)