def _format_percent(fraction):
    """Return ``fraction`` (e.g. 0.57) rendered as a percentage string.

    The value is scaled to a percentage *before* rounding; rounding first and
    multiplying afterwards leaks float artifacts such as
    ``56.99999999999999`` into the printed report.
    """
    return str(round(fraction * 100, 2)) + '%'


def _unique_tags(tag_sequences):
    """Return the distinct tags found in ``tag_sequences`` in first-seen order.

    Args:
        tag_sequences: iterable of per-sentence tag sequences. Tags must be
            hashable, since a set is used for constant-time membership tests.
    """
    seen = set()
    ordered = []
    for sentence_tags in tag_sequences:
        for tag in sentence_tags:
            if tag not in seen:
                seen.add(tag)
                ordered.append(tag)
    return ordered


def _run_baseline(data):
    """Fit the most-frequent-tag baseline on the train split and report on test."""
    print('\n Baseline Model')
    print('---------------------------------------------------')
    baseline = MostFrequentTag(data.train_data)
    baseline.calculate_the_most_frequent_tag_per_word()
    predicted_labels, given_labels = baseline.predict(data.test_data)

    # Named ``baseline_eval`` rather than ``eval`` so the builtin stays usable.
    baseline_eval = Evaluation()
    baseline_accuracy = baseline_eval.computeSeqAccuracy(
        predicted_labels, given_labels)
    # NOTE(review): unlike the HMM and CRF reports this value is printed
    # without the x100 scaling; confirm whether Evaluation.computeSeqAccuracy
    # returns a fraction or an already-scaled percentage.
    print('The accuracy of Most Frequent Tag baseline is: ' +
          str(round(baseline_accuracy, 2)) + '%')

    # Restrict the report to the tags that occur in the gold labels.
    baseline_labels = _unique_tags(given_labels)
    baseline_report = metrics.flat_classification_report(
        given_labels, predicted_labels, labels=baseline_labels)
    print(baseline_report)


def _run_hmm(data):
    """Train the HMM tagger on the train split and report on the dev split."""
    hmm_tagger = HMMTagger()
    hmm_tagger.train_tagger(data.train_data)
    # NOTE(review): the HMM is scored on dev_data while the baseline and the
    # CRF are scored on test_data; confirm that this is intended.
    dev_tags, dev_pred = hmm_tagger.predict(data.dev_data)

    hmmeval = HmmEvaluation(data.train_data, data.dev_data)
    hmmeval.calculate_overall_accuracy_and_f1_score_per_pos()
    hmmeval.plot_learning_curves(
        'The learning curves for hmm pos tagger model')
    hmmeval.plot_f1_curves()

    hmm_accuracy = hmmeval.computeSeqAccuracy(dev_pred, dev_tags)
    print('The accuracy of hmm Pos Tagger is: ' +
          _format_percent(hmm_accuracy))


def _run_crf(data):
    """Tune the CRF on the dev split, train it, and report on the test split."""
    print('\n Crf Pos Tagger')
    print('---------------------------------------------------')
    print(
        'Create the features for crf pos tagger model for train, dev and test set.\n'
    )
    crf_pos = CrfPosTagger()
    trainSeqFeatures, trainSeqLabels = crf_pos.transformDatasetSequence(
        data.train_data)
    devSeqFeatures, devSeqLabels = crf_pos.transformDatasetSequence(
        data.dev_data)
    testSeqFeatures, testSeqLabels = crf_pos.transformDatasetSequence(
        data.test_data)

    print(
        'Tune the hyper-parameters c1 and c2 of crf model on a held-out part of the training data, the dev set.\n'
    )
    crf_hyperparameters = tune_the_hyperparameters_on_held_out_data(
        trainSeqFeatures, trainSeqLabels, devSeqFeatures, devSeqLabels)
    print('The best value for c1 is ' + str(crf_hyperparameters[0]) +
          ' and c2 = ' + str(crf_hyperparameters[1]) + '. \n')

    print('Train crf using the best hyperparameters. \n')
    crf_model = crf_pos.trainCRF(trainSeqFeatures, trainSeqLabels,
                                 crf_hyperparameters)

    crfeval = CrfEvaluation(trainSeqFeatures, trainSeqLabels, devSeqFeatures,
                            devSeqLabels)
    crfeval.calculate_overall_accuracy_and_f1_score_per_pos(
        crf_hyperparameters)
    crfeval.plot_learning_curves(
        'The learning curves for crf pos tagger model')
    crfeval.plot_f1_curves()

    print('Predict the labels on test set. \n')
    pred_labels = crf_model.predict(testSeqFeatures)
    crf_accuracy = crfeval.computeSeqAccuracy(pred_labels, testSeqLabels)
    print('The accuracy of crf Pos Tagger is: ' +
          _format_percent(crf_accuracy))

    labels = list(crf_model.classes_)
    crf_report = metrics.flat_classification_report(testSeqLabels,
                                                    pred_labels,
                                                    labels=labels)
    print(crf_report)


def main():
    """Run the baseline, HMM and CRF POS-tagging experiments in sequence.

    Seeds the ``random`` module with a fixed value, loads the data through
    ``DataLoader`` and then prints each model's accuracy and report to
    standard output. Takes no arguments and returns ``None``.
    """
    print("Starting app...")

    # Fixed seed, set before any data is loaded or any model is built.
    SEED = 56427
    random.seed(SEED)

    data = DataLoader()
    data.load_data()

    # Create baseline model predict, print accuracy
    _run_baseline(data)

    # Create Hmm model, train, predict, print accuracy
    _run_hmm(data)

    # Create crf model, train, predict, print accuracy
    _run_crf(data)