from Corpus import Corpus from Rule import PossibleRules print("Analysis starting...") train_corpus = Corpus(["./dataset/TrainingSet/file1.txt" , "./dataset/TrainingSet/file2.txt" , "./dataset/TrainingSet/file3.txt" , "./dataset/TrainingSet/file4.txt" , "./dataset/TrainingSet/file5.txt" , "./dataset/TrainingSet/file6.txt" , "./dataset/TrainingSet/file7.txt" , "./dataset/TrainingSet/file8.txt" ]) train_corpus.outputWords("./Output/MostLikelyMorphParseForWord.txt") print("Most likely morphological parses for words are written to ./Output/MostLikelyMorphParseForWord.txt") train_corpus.outputPOStags("./Output/MostLikelyTag.txt") print("Most likely tags are written to ./Output/MostLikelyTag.txt") train_corpus.tag_words_with_most_likely_parses() tag_order = 1 print("TRAIN: Precision for DS" + str(tag_order) + " " + str(train_corpus.calculate_precision())) print("Possible rules are generating...") rules = PossibleRules(train_corpus.tags[:20]).rules # just try first 20 words in the training corpus since it is expensive to walk through all the words print(str(len(train_corpus.all_words_in_corpus)) + " words in training set.") learned_rules_with_precision = [] for rule in rules:
# Driver script: build the training corpus, write the most likely parses and
# tags to disk, report baseline precision, and generate candidate rules.
from Corpus import Corpus
from Rule import PossibleRules

print("Analysis starting...")

# Eight training-set files make up the corpus.
train_corpus = Corpus(
    [
        "./dataset/TrainingSet/file1.txt",
        "./dataset/TrainingSet/file2.txt",
        "./dataset/TrainingSet/file3.txt",
        "./dataset/TrainingSet/file4.txt",
        "./dataset/TrainingSet/file5.txt",
        "./dataset/TrainingSet/file6.txt",
        "./dataset/TrainingSet/file7.txt",
        "./dataset/TrainingSet/file8.txt",
    ]
)

# Write the per-word parses, then announce the destination file.
train_corpus.outputWords("./Output/MostLikelyMorphParseForWord.txt")
print("Most likely morphological parses for words are written to ./Output/MostLikelyMorphParseForWord.txt")

# Same pattern for the tags.
train_corpus.outputPOStags("./Output/MostLikelyTag.txt")
print("Most likely tags are written to ./Output/MostLikelyTag.txt")

train_corpus.tag_words_with_most_likely_parses()

# tag_order is used here only to label the precision line ("DS1").
tag_order = 1
print(
    "".join(
        (
            "TRAIN: Precision for DS",
            str(tag_order),
            " ",
            str(train_corpus.calculate_precision()),
        )
    )
)

print("Possible rules are generating...")

# Only the first 20 entries of train_corpus.tags feed rule generation; per the
# original author's note, walking through all the words is too expensive.
rules = PossibleRules(train_corpus.tags[:20]).rules

print("".join((str(len(train_corpus.all_words_in_corpus)), " words in training set.")))

learned_rules_with_precision = []