def unigram(train_sentences, test_sentences):
    """Train a unigram HMM tagger and report its accuracy on the test set.

    Sentences are flattened into individual tagged samples first, since a
    unigram model attaches no meaning to sentence structure.  Accuracy is
    printed separately for test words seen during training ("known"), words
    never seen during training ("unknown"), and the full test set.
    """
    training_samples = sentences_to_samples(train_sentences)
    testing_samples = sentences_to_samples(test_sentences)

    tagger = Unigram(training_samples)
    tagger.train()

    # Partition the test samples by whether their word occurs in training data.
    known_samples, unknown_samples = divide_test_to_known_and_unknown_samples(
        train_sentences, test_sentences)

    # Report per-subset accuracy, then the overall rate.
    print("Accuracy rate for unknown words: ",
          tagger.get_accuracy_rate(np.array(unknown_samples)))
    print("Accuracy rate for known words: ",
          tagger.get_accuracy_rate(np.array(known_samples)))
    print("Total accuracy rate: ",
          tagger.get_accuracy_rate(np.array(testing_samples)))
class Author(object):
    """Bundles one author's name with per-author unigram, bigram and trigram
    language models, and fans calls out to all three models at once.

    Note: the original version also declared ``__name``/``__unigram``/
    ``__bigram``/``__trigram`` as class-level attributes, instantiating all
    three models at class-definition time.  Those were shared class state,
    immediately shadowed by ``__init__``, and pure wasted work — removed.
    """

    def __init__(self, name):
        """Create an author with its own fresh n-gram models.

        :param name: display name of the author.
        """
        self.__name = name
        self.__unigram = Unigram()
        self.__bigram = Bigram()
        self.__trigram = Trigram()

    # Getters — kept verbatim for backward compatibility with existing
    # callers (a @property would be more Pythonic but changes the API).
    def getUnigram(self):
        return self.__unigram

    def getBigram(self):
        return self.__bigram

    def getTrigram(self):
        return self.__trigram

    def getName(self):
        return self.__name

    def counterCaller(self, separated_line):
        """Feed one tokenized line to all three frequency counters."""
        self.__unigram.counter(separated_line)
        self.__bigram.counter(separated_line)
        self.__trigram.counter(separated_line)

    def generatorCaller(self, uni_list, bi_list, tri_list):
        """Generate new text with each model into its respective list."""
        self.__unigram.generator(uni_list)
        self.__bigram.generator(bi_list)
        self.__trigram.generator(tri_list)
# Configure global settings from the parsed CLI args, load the three data
# splits, build the requested model, train it by gradient ascent, and plot.
# NOTE(review): line structure reconstructed from a collapsed source line;
# the three model.* calls after the if/elif chain are assumed to run for
# every model choice — confirm against the original file.
print(os.cpu_count())
Config.num_threads = os.cpu_count()  # use every available core
Config.epsilon = args.epsilon
Config.learning_rate = args.lr
Config.lamb = args.lamb  # presumably a regularization weight — TODO confirm
Config.t = args.t
data = IOModule()
data_set = data.read_file(Config.train_data)
valid_set = data.read_file(Config.validate_data)
test_set = data.read_file(Config.test_data)
# Choose the model implementation by its CLI name.
if args.model == 'unigram':
    model = Unigram(data_set, valid_set, test_set)
elif args.model == 'ngram':
    model = BiTrigram(data_set, valid_set, test_set)
elif args.model == 'custom':
    model = CustomModel(data_set, valid_set, test_set)
elif args.model == 'best':
    # 'best' merges the feature sets of the ngram and custom models into
    # a single combined Model.
    model1 = BiTrigram(data_set, valid_set, test_set)
    model2 = CustomModel(data_set, valid_set, test_set)
    model = Model(data_set, valid_set, test_set)
    model.combine_features_from_models(model1, model2)
model.generate_input_matrix()
model.gradient_ascent()
model.plot_output(args.model)
def __init__(self, name):
    """Initialize the author: store its name and attach one fresh
    unigram, bigram and trigram model instance.

    :param name: display name of the author.
    """
    # Each author owns independent model instances.
    self.__unigram = Unigram()
    self.__bigram = Bigram()
    self.__trigram = Trigram()
    self.__name = name
# Strip error markup from origWords in place: a token starting with "targ="
# opens a region that runs up to the matching "</ERR>" tag; the region and
# the closing tag are both deleted.
# NOTE(review): nesting reconstructed from a collapsed source line — confirm
# the two `del` statements and the first `i += 1` belong inside the
# if-branch as shown.
i = 0
delete = False  # NOTE(review): appears unused in this chunk
while i < len(origWords):
    if(origWords[i].startswith("targ=")):
        # Deleting at index i shifts the list left, so the inner loop keeps
        # removing until the closing tag arrives at position i ...
        while(origWords[i] != "</ERR>"):
            del origWords[i]
        # ... then remove the "</ERR>" tag itself.
        del origWords[i]
        i += 1
    else:
        i += 1
### Let's form our bigram.
unigram = Unigram(correctWords)
bigram = Bigram(correctWords, unigram)
# Count single-letter and letter-pair frequencies over the corrected words,
# including a word-boundary marker (one boundary entry per word).
lettersMap = {}
lettersMap[EditDistance.WORDBOUNDARY] = len(correctWords)
for word in correctWords:
    for i in range(len(word)):
        lettersMap[word[i]] = lettersMap.get(word[i] , 0) + 1
    # Boundary + first letter, so word-initial pairs are also counted.
    lettersMap[(EditDistance.WORDBOUNDARY + word[0])] = lettersMap.get((EditDistance.WORDBOUNDARY + word[0]), 0) + 1
    for i in range(len(word) - 1):
        lettersMap[(word[i] + word[i + 1])] = lettersMap.get((word[i] + word[i + 1]) , 0) + 1
# This is for creating the edit distances. They return a hashmap with a tuple like this: ('ins', 'a', 'ab') -> 22
changeMap = {}
# The set of misspelled words, looked up by the indexes of corrections.
wrongWordsSet = set([origWords[i] for i in correctionIndexes])
start = time.time()