def main(args):
    """Train unigram/bigram/trigram models and write dev-set predictions.

    args: argv-style list -- [prog, training-file, dev-file, [output-dir]].
    Exits with status 1 when the argument count is wrong.
    """
    logging.basicConfig(level=LOGGING_LEVEL, format="DEBUG: %(message)s")
    if len(args) < 3 or len(args) > 4:
        # Py3 print() calls (the originals were Py2 print statements).
        print('usage: %s training-file dev-file [output-dir]' % args[0])
        print('  output-dir is optional, default is "%s"' % OUTPUT_DIR_DEFAULT)
        sys.exit(1)
    training_filename = args[1]
    dev_filename = args[2]
    output_dir = args[3] if len(args) == 4 else OUTPUT_DIR_DEFAULT

    logging.debug('Training models...')
    # train all the models!
    unigram_model = Unigram(training_filename)
    logging.debug('Done training unigram model')
    bigram_model = Bigram(training_filename)
    logging.debug('Done training bigram model')
    trigram_model = Trigram(training_filename)
    logging.debug('Done training trigram model')

    # Use a context manager so the dev file is closed deterministically
    # (the original leaked the handle from a bare open() in a comprehension).
    with open(dev_filename, 'r') as dev_file:
        dev_words = [line.strip() for line in dev_file]

    # write predictions out to disk
    unigram_model.write_probability_list(
        dev_words, get_output_filename(output_dir, dev_filename, 'unigram'))
    logging.debug('Wrote dev set predictions using unigram model')
    bigram_model.write_probability_list(
        dev_words, get_output_filename(output_dir, dev_filename, 'bigram'))
    logging.debug('Wrote dev set predictions using bigram model')
    trigram_model.write_probability_list(
        dev_words, get_output_filename(output_dir, dev_filename, 'trigram'))
    logging.debug('Wrote dev set predictions using trigram model')
def find_unigrams(self):
    """Build the unigram model for this input and cache its results.

    Populates self.no_of_words (unigram count) and self.words (the
    ranked unigram list) from a freshly-trained Unigram instance.
    """
    # Py3 print() calls replace the original Py2 print statements;
    # comma-separated arguments produce identical space-joined output.
    print("finding unigrams")
    print("inputpath", self.input_path)
    u = Unigram()
    u.set_input_path(self.input_path)
    u.set_output_path(self.unigram_output_path)
    u.find_unigram()
    self.no_of_words = u.no_of_unigrams
    print("self.no_of_words", u.get_no_of_unigrams())
    self.words = u.ranked_list
    print("self.words", len(self.words))
class Author(object):
    """An author with dedicated unigram, bigram and trigram language models.

    The original version also created Unigram/Bigram/Trigram instances as
    class-level attributes; those were built once at class-definition time,
    shared by all instances, and immediately shadowed by the per-instance
    models assigned in __init__ -- so they have been removed.
    """

    # Constructor.
    def __init__(self, name):
        self.__name = name
        self.__unigram = Unigram()
        self.__bigram = Bigram()
        self.__trigram = Trigram()

    # Getters (kept for backward compatibility with existing callers).
    def getUnigram(self):
        return self.__unigram

    def getBigram(self):
        return self.__bigram

    def getTrigram(self):
        return self.__trigram

    def getName(self):
        return self.__name

    # Caller method, it is used for counting frequency in the unigram, bigram and trigram.
    def counterCaller(self, separated_line):
        self.__unigram.counter(separated_line)
        self.__bigram.counter(separated_line)
        self.__trigram.counter(separated_line)

    # Caller method, it is used for generating new text with respect to unigram, bigram and trigram.
    def generatorCaller(self, uni_list, bi_list, tri_list):
        self.__unigram.generator(uni_list)
        self.__bigram.generator(bi_list)
        self.__trigram.generator(tri_list)
def unigram(train_sentences, test_sentences):
    """Train a unigram HMM on the training set and report tagging accuracy
    for unknown words, known words, and the full test set."""
    # A unigram model ignores sentence structure, so flatten both corpora
    # into independent tagged samples up front.
    train_samples = sentences_to_samples(train_sentences)
    test_samples = sentences_to_samples(test_sentences)

    model = Unigram(train_samples)
    model.train()

    # Partition the test samples by whether their word appeared in training.
    known_samples, unknown_samples = divide_test_to_known_and_unknown_samples(
        train_sentences, test_sentences)

    # Report accuracy for each slice of the test data.
    print("Accuracy rate for unknown words: ",
          model.get_accuracy_rate(np.array(unknown_samples)))
    print("Accuracy rate for known words: ",
          model.get_accuracy_rate(np.array(known_samples)))
    print("Total accuracy rate: ",
          model.get_accuracy_rate(np.array(test_samples)))
def __init__(self, name):
    """Initialize an author: fresh n-gram models plus the given name."""
    # Each instance owns its own model objects.
    self.__unigram = Unigram()
    self.__bigram = Bigram()
    self.__trigram = Trigram()
    self.__name = name
# Configure run-time parameters from the parsed CLI arguments.
print(os.cpu_count())
Config.num_threads = os.cpu_count()
Config.epsilon = args.epsilon
Config.learning_rate = args.lr
Config.lamb = args.lamb
Config.t = args.t

# Load the three data splits.
data = IOModule()
data_set = data.read_file(Config.train_data)
valid_set = data.read_file(Config.validate_data)
test_set = data.read_file(Config.test_data)

# Select the model implementation requested on the command line.
if args.model == 'unigram':
    model = Unigram(data_set, valid_set, test_set)
elif args.model == 'ngram':
    model = BiTrigram(data_set, valid_set, test_set)
elif args.model == 'custom':
    model = CustomModel(data_set, valid_set, test_set)
elif args.model == 'best':
    # 'best' merges the features of the n-gram and custom models.
    model1 = BiTrigram(data_set, valid_set, test_set)
    model2 = CustomModel(data_set, valid_set, test_set)
    model = Model(data_set, valid_set, test_set)
    model.combine_features_from_models(model1, model2)
else:
    # Fail fast: previously an unrecognized value left `model` unbound and
    # crashed below with a confusing NameError.
    raise ValueError('unknown model: %s' % args.model)

model.generate_input_matrix()
model.gradient_ascent()
model.plot_output(args.model)
# Strip annotated error regions from the raw word list: drop everything from
# a "targ=" marker through the matching "</ERR>" closing tag, inclusive.
# (The original also set an unused `delete` flag; removed.)
i = 0
while i < len(origWords):
    if origWords[i].startswith("targ="):
        while origWords[i] != "</ERR>":
            del origWords[i]
        del origWords[i]  # remove the "</ERR>" tag itself
        # NOTE(review): the deletion shifts the next word into slot i, so this
        # increment skips examining it — preserved as-is; confirm intent.
        i += 1
    else:
        i += 1

### Let's form our bigram.
unigram = Unigram(correctWords)
bigram = Bigram(correctWords, unigram)

# Frequency table of single characters and character pairs (plus word-boundary
# bigrams) over the correct words; used to normalize edit-distance counts.
lettersMap = {}
lettersMap[EditDistance.WORDBOUNDARY] = len(correctWords)
for word in correctWords:
    if not word:
        continue  # guard: an empty string would crash on word[0] below
    for ch in word:
        lettersMap[ch] = lettersMap.get(ch, 0) + 1
    boundary_pair = EditDistance.WORDBOUNDARY + word[0]
    lettersMap[boundary_pair] = lettersMap.get(boundary_pair, 0) + 1
    for a, b in zip(word, word[1:]):
        lettersMap[a + b] = lettersMap.get(a + b, 0) + 1

# This is for creating the edit distances. They return a hashmap with a tuple like this: ('ins', 'a', 'ab') -> 22
changeMap = {}
wrongWordsSet = {origWords[i] for i in correctionIndexes}
start = time.time()