def setUp(self):
    """Create a fresh ``Loader`` and the three small tabular fixtures.

    Each fixture is a list of rows whose first row is the header
    ``["Age", "Income", "class"]``. The fixtures hold four instances,
    three instances, and three instances containing empty cells.
    """
    header = ("Age", "Income", "class")
    complete_rows = (
        ("13", "1000", "yes"),
        ("18", "5000", "no"),
        ("15", "3000", "no"),
        ("14", "800", "yes"),
    )
    rows_with_gaps = (
        ("13", "", "yes"),
        ("18", "5000", ""),
        ("", "3000", "no"),
    )

    def as_table(rows):
        # Build brand-new nested lists on every call so the fixtures
        # never share row objects with each other.
        return [list(header)] + [list(row) for row in rows]

    self.dataLoader = Loader()
    self.dataWithEvenInstances = as_table(complete_rows)
    self.dataWithOddInstances = as_table(complete_rows[:3])
    self.dataWithMissingValues = as_table(rows_with_gaps)
def Test(opt):
    """Evaluate the model built from ``opt`` on the ``'test'`` split.

    The predictions and targets returned by ``eval_utils.evaluate`` are
    passed to ``overlapping_test`` and ``multiple_test``.

    Args:
        opt: options object; the attributes read here are
            ``input_label2id``, ``eval_batch_size``, ``rel_num`` and
            ``input_rel2id``.
    """
    loader = Loader(opt)
    Model = model.setup(opt).cuda()
    label2id = load_label(opt.input_label2id)
    # The attention scores and metrics returned by evaluate are not used here.
    predictions, targets, _, _ = eval_utils.evaluate(
        Model, loader, label2id, opt.eval_batch_size, opt.rel_num, 'test')

    # Use a context manager so the mapping file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open(opt.input_rel2id, 'r') as rel2id_file:
        rel2id = json.load(rel2id_file)
    # NOTE(review): id2rel is not consumed by the visible code; it is kept so
    # the relation file is still loaded and validated exactly as before.
    id2rel = {v: k for k, v in rel2id.items()}

    overlapping_test(predictions, targets)
    multiple_test(predictions, targets)
def startProcess(self, labelWidget):
    """
    method to start process after all setters have been activated

    Runs the pipeline in order: load the data, clean the training and test
    sets, discretize them, build the classifier, classify the test set and
    compute the accuracy. A CSV file is written after the cleaning,
    discretization and classification steps, and the classifier rules
    (with the accuracy appended) are written to a text file. A progress
    line is appended to ``labelWidget`` after every stage.

    Attributes:
        labelWidget(tkinter.Label) : a message box for showing process to user

    Returns:
        The result of the last ``labelWidget.configure`` call, i.e. the one
        that shows either the success message or an error message.
    """
    def appendMessage(message):
        # Extend the text already displayed instead of replacing it.
        return labelWidget.configure(text=labelWidget.cget("text") + message)

    fileCreator = CreateFile()
    dataCleaner = Cleaner()
    dataDiscretization = Discretization()
    Calculator = MiningCalculator()
    dataClassifier = Classifier()
    dataLoader = Loader()
    splitFunction = Calculator.getSplitFunc(self.classifierSplitType)
    try:
        appendMessage("Building process starting\n")
        dataLoader.loadData(self.folderPath)
        appendMessage("Data loading Finished\n")

        dataCleaner.cleanTrainingSet(dataLoader.trainingSet,
                                     dataLoader.structure)
        dataCleaner.cleanTestSet(dataLoader.testSet, dataLoader.structure)
        fileCreator.createCsvFile(dataLoader.structure,
                                  dataLoader.trainingSet,
                                  "Clean Training set",
                                  self.savingFolderPath)
        fileCreator.createCsvFile(dataLoader.structure, dataLoader.testSet,
                                  "Clean Test set", self.savingFolderPath)
        appendMessage("Data cleaning Finished\n")

        dataDiscretization.discretizationData(dataLoader.trainingSet,
                                              dataLoader.testSet,
                                              dataLoader.structure,
                                              self.discretizationBins,
                                              self.discretizationType)
        fileCreator.createCsvFile(dataLoader.structure,
                                  dataLoader.trainingSet,
                                  "Discretization Training set",
                                  self.savingFolderPath)
        fileCreator.createCsvFile(dataLoader.structure, dataLoader.testSet,
                                  "Discretization Test set",
                                  self.savingFolderPath)
        appendMessage("Data Discretization Finished\n")

        classifier = dataClassifier.buildClassifier(
            dataLoader.trainingSet, dataLoader.structure,
            self.classifierType, splitFunction)
        classifiedTestData = dataClassifier.classifyTest(
            dataLoader.testSet, dataLoader.structure, classifier)
        fileCreator.createCsvFile(dataLoader.structure, classifiedTestData,
                                  "Classified Test set",
                                  self.savingFolderPath)
        accuracy = dataClassifier.checkAccuracyOfClassifier(
            classifiedTestData, dataLoader.testSet)
        # The accuracy is stored as the last entry of the rules file.
        classifier += ["accuracy: " + str(accuracy)]
        fileCreator.createTxtFile(classifier, "Rules",
                                  self.savingFolderPath)
        appendMessage("Building classifier Finished\n")
        return appendMessage(
            "Classifier build successfully with accuracy: " +
            str(round(accuracy, 3)) + "\n")
    except EnvironmentError:
        return appendMessage(
            "Problem with file\\ file path. please check file is not empty and file path is correct!"
        )
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return appendMessage(
            "An Error occurred please check file and inputs and start again!"
        )
def train(opt):
    """Train the model described by ``opt`` and checkpoint it along the way.

    The loop runs until ``opt.max_epoch`` epochs have been completed, or
    indefinitely when ``opt.max_epoch`` is ``-1``. Whenever a new epoch
    begins and the epoch counter is non-zero, the model is evaluated on the
    ``'dev'`` split and a checkpoint is saved; an additional ``'best'``
    checkpoint is saved when the dev F1 exceeds the best score so far.
    When ``opt.start_from`` is set, ``infos``, ``histories`` and the
    optimizer state are restored from that directory before training.

    Args:
        opt: options object; the attributes read here are ``start_from``,
            ``input_label2id``, ``eval_batch_size``, ``rel_num``,
            ``learning_rate``, ``learning_rate_decay_start``,
            ``learning_rate_decay_every``, ``learning_rate_decay``,
            ``batch_size``, ``use_char``, ``use_pos``, ``grad_clip``,
            ``save_loss_every`` and ``max_epoch``. ``opt.current_lr`` is
            written by this function.

    A ``RuntimeError`` or ``KeyboardInterrupt`` raised inside the loop is
    not propagated: a checkpoint is saved and the stack trace is printed.
    """
    loader = Loader(opt)
    infos = {}
    histories = {}
    Model = model.setup(opt).cuda()
    LW_model = LossWrapper(Model, opt)
    LW_model.train()
    optimizer = utils.build_optimizer(Model.parameters(), opt)

    if opt.start_from is not None:
        # NOTE(security): the files below are deserialized (torch.load
        # unpickles; utils.pickle_load presumably does too). Only resume
        # from checkpoint directories you trust.
        with open(os.path.join(opt.start_from, 'infos-best.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
        histories_path = os.path.join(opt.start_from, 'histories-best.pkl')
        if os.path.isfile(histories_path):
            with open(histories_path, 'rb') as f:
                histories = utils.pickle_load(f)
        optimizer_path = os.path.join(opt.start_from, 'optimizer-best.pth')
        if os.path.isfile(optimizer_path):
            optimizer.load_state_dict(torch.load(optimizer_path))
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['opt'] = opt
        infos['label2id'] = load_label(opt.input_label2id)

    # The defaults must be integers: a string default would make the
    # ``epoch != 0`` check true and break the integer comparisons below.
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    best_val_score = infos.get('best_val_score', 0)
    # Older checkpoints stored the validation history under the key 'c';
    # fall back to it so their history is not lost on resume.
    val_result_history = histories.get('val_result_history',
                                       histories.get('c', {}))
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})

    epoch_done = True
    try:
        while True:
            if epoch_done:
                # The iteration counter restarts at every epoch boundary.
                iteration = 0
                if epoch != 0:
                    predictions, targets, _, metrics = eval_utils.evaluate(
                        Model, loader, infos['label2id'],
                        opt.eval_batch_size, opt.rel_num, 'dev')
                    # NOTE(review): iteration is always 0 here, so only the
                    # most recent dev result is kept - confirm whether this
                    # was meant to be keyed by epoch.
                    val_result_history[iteration] = {
                        'predictions': predictions,
                        'metrics': metrics,
                        'targets': targets,
                    }
                    current_score = metrics['F1']

                    # Write under the same key that is read back on resume.
                    histories['val_result_history'] = val_result_history
                    histories['loss_history'] = loss_history
                    histories['lr_history'] = lr_history

                    best_flag = False
                    if current_score > best_val_score:
                        best_val_score = current_score
                        best_flag = True
                    infos['best_val_score'] = best_val_score

                    save_checkpoint(Model, infos, optimizer, histories)
                    if best_flag:
                        save_checkpoint(Model, infos, optimizer,
                                        append='best')
                epoch_done = False

                # Step-wise decay: the rate is multiplied by
                # learning_rate_decay once every learning_rate_decay_every
                # epochs after learning_rate_decay_start.
                if (epoch > opt.learning_rate_decay_start
                        and opt.learning_rate_decay_start >= 0):
                    frac = ((epoch - opt.learning_rate_decay_start)
                            // opt.learning_rate_decay_every)
                    decay_factor = opt.learning_rate_decay ** frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)

            data = loader.get_batch_train(opt.batch_size)
            # The last element of the batch is the wrapped flag that marks
            # the end of an epoch; everything before it is moved to the GPU.
            wrapped = data[-1]
            data = data[:-1]

            torch.cuda.synchronize()
            start = time.time()
            data = [t.cuda() for t in data]
            sents, rels, labels, poses, chars, sen_lens = data
            if not opt.use_char:
                chars = None
            if not opt.use_pos:
                poses = None

            # mask is 1 for the first sen_lens[i] positions of each row.
            mask = torch.zeros(sents.size()).cuda()
            for i in range(sents.size(0)):
                mask[i][:sen_lens[i]] = 1
            # Positions labelled 8 get weight 1, all others weight 10.
            # NOTE(review): presumably 8 is the id of the "no tag" label -
            # confirm against the label2id mapping.
            mask2 = torch.where(labels == 8, torch.ones_like(sents),
                                torch.ones_like(sents) * 10).cuda()
            mask2 = mask2.float() * mask.float()

            optimizer.zero_grad()
            sum_loss = LW_model(sents, sen_lens, rels, mask, labels, mask2,
                                poses, chars)
            # Average the summed loss over the batch dimension.
            loss = sum_loss / sents.shape[0]
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()

            if iteration % 200 == 0:
                end = time.time()
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss, end - start))

            iteration += 1
            if wrapped:
                epoch += 1
                epoch_done = True
            infos['iter'] = iteration
            infos['epoch'] = epoch

            if iteration % opt.save_loss_every == 0:
                loss_history[iteration] = train_loss
                lr_history[iteration] = opt.current_lr

            if opt.max_epoch != -1 and epoch >= opt.max_epoch:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Save the current state so an interrupted run can be resumed.
        print('Save ckpt on exception ...')
        save_checkpoint(Model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
class TestDataLoader(unittest.TestCase):
    """Unit tests for ``Loader``.

    Every group of tests is run against three fixtures: a table with four
    instances, a table with three instances, and a table with empty cells.
    """

    dataLoader = None
    dataWithEvenInstances = []
    dataWithOddInstances = []
    dataWithMissingValues = []

    def setUp(self):
        """Create a fresh loader and fresh fixture tables for every test."""
        self.dataLoader = Loader()
        self.dataWithEvenInstances = [
            ["Age", "Income", "class"],
            ["13", "1000", "yes"],
            ["18", "5000", "no"],
            ["15", "3000", "no"],
            ["14", "800", "yes"],
        ]
        # Same header and first three instances, copied row by row so the
        # two tables never share list objects.
        self.dataWithOddInstances = [
            list(row) for row in self.dataWithEvenInstances[:4]
        ]
        self.dataWithMissingValues = [
            ["Age", "Income", "class"],
            ["13", "", "yes"],
            ["18", "5000", ""],
            ["", "3000", "no"],
        ]

    # ------------------------------------------------------------------
    # shared expectations and assertion helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _expectedStructure():
        """Return the structure expected from ``buildStructure``."""
        return {
            "Age": {"index": 0, "values": ["Numeric"]},
            "Income": {"index": 1, "values": ["Numeric"]},
            "class": {"index": 2, "values": ["yes", "no"]},
        }

    @staticmethod
    def _expectedColumnsName():
        """Return the mapping expected from ``getColumnsName``."""
        return {
            "Age": {"index": 0},
            "Income": {"index": 1},
            "class": {"index": 2},
        }

    @staticmethod
    def _classOnlyStructure():
        """Return the structure argument handed to ``buildDataSets``."""
        return {'class': {'index': 2, 'values': ['no', 'yes']}}

    def _checkBuildStructure(self, table):
        self.dataLoader.buildStructure(table)
        self.assertEqual(self._expectedStructure(), self.dataLoader.structure)

    def _checkColumnsName(self, table):
        self.assertEqual(self._expectedColumnsName(),
                         self.dataLoader.getColumnsName(table))

    def _checkColumnValues(self, table):
        expectedPerColumn = (["Numeric"], ["Numeric"], ["yes", "no"])
        for column, expected in enumerate(expectedPerColumn):
            self.assertEqual(
                expected, self.dataLoader.getColumnValues(column, table, 2))

    def _checkIsNumeric(self, table):
        for column, expected in enumerate((True, True, False)):
            self.assertEqual(expected,
                             self.dataLoader.isNumeric(column, table))

    def _checkBuildDataSets(self, table, expectedTraining, expectedTest):
        # The header row is dropped before the instances are split.
        self.dataLoader.buildDataSets(table[1:], self._classOnlyStructure())
        self.assertEqual(expectedTraining, self.dataLoader.trainingSet)
        self.assertEqual(expectedTest, self.dataLoader.testSet)

    # ------------------------------------------------------------------
    # loadData
    # ------------------------------------------------------------------
    def test_loadData(self):
        # need to read from file
        pass

    # ------------------------------------------------------------------
    # buildStructure
    # ------------------------------------------------------------------
    def test_buildStructure_dataWithEvenInstances(self):
        self._checkBuildStructure(self.dataWithEvenInstances)

    def test_buildStructure_dataWithOddInstances(self):
        self._checkBuildStructure(self.dataWithOddInstances)

    def test_buildStructure_dataWithMissingValues(self):
        self._checkBuildStructure(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # getColumnsName
    # ------------------------------------------------------------------
    def test_getColumnsName_dataWithEvenInstances(self):
        self._checkColumnsName(self.dataWithEvenInstances)

    def test_getColumnsName_dataWithOddInstances(self):
        self._checkColumnsName(self.dataWithOddInstances)

    def test_getColumnsName_dataWithMissingValues(self):
        self._checkColumnsName(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # getColumnValues
    # ------------------------------------------------------------------
    def test_getColumnValues_dataWithEvenInstances(self):
        self._checkColumnValues(self.dataWithEvenInstances)

    def test_getColumnValues_dataWithOddInstances(self):
        self._checkColumnValues(self.dataWithOddInstances)

    def test_getColumnValues_dataWithMissingValues(self):
        self._checkColumnValues(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # isNumeric
    # ------------------------------------------------------------------
    def test_isNumeric_dataWithEvenInstances(self):
        self._checkIsNumeric(self.dataWithEvenInstances)

    def test_isNumeric_dataWithOddInstances(self):
        self._checkIsNumeric(self.dataWithOddInstances)

    def test_isNumeric_dataWithMissingValues(self):
        self._checkIsNumeric(self.dataWithMissingValues)

    # ------------------------------------------------------------------
    # buildDataSets
    # ------------------------------------------------------------------
    def test_buildTrainingSet_dataWithEvenInstances(self):
        self._checkBuildDataSets(
            self.dataWithEvenInstances,
            [["18", "5000", "no"], ["13", "1000", "yes"]],
            [["15", "3000", "no"], ["14", "800", "yes"]])

    def test_buildTrainingSet_dataWithOddInstances(self):
        self._checkBuildDataSets(
            self.dataWithOddInstances,
            [["18", "5000", "no"], ["13", "1000", "yes"]],
            [["15", "3000", "no"]])

    def test_buildTrainingSet_dataWithMissingValues(self):
        self._checkBuildDataSets(
            self.dataWithMissingValues,
            [["", "3000", "no"], ["13", "", "yes"], ["18", "5000", ""]],
            [])
true_pos += 1 elif p_ == 'O': false_neg += 1 elif p_ != y_: false_pos += 1 prec = true_pos / (true_pos + false_pos) if true_pos + false_pos != 0 else 0 recall = true_pos / (true_pos + false_neg) if true_pos + false_neg != 0 else 0 f1 = 2 * prec * recall / (prec + recall) if prec + recall != 0 else 0 return f1, prec, recall if __name__ == '__main__': print("Load Data from file") loader = Loader() train_len, train_word, train_tag, train_char, train_orth, train_label = loader.load_data( 'train') print("Train: ", train_word.shape[0]) dev_len, dev_word, dev_tag, dev_char, dev_orth, dev_label = loader.load_data( 'dev') print("Dev: ", dev_word.shape[0]) test_len, test_word, test_tag, test_char, test_orth, test_label = loader.load_data( 'test') print("Test: ", test_word.shape[0]) EMBEDDING_DIM = 200 HIDDEN_DIM = 100 model = BiLSTM_CRF(len(loader.word_to_id), loader.label_to_id, EMBEDDING_DIM, HIDDEN_DIM)