예제 #1
0
def unigram(train_sentences, test_sentences):
    """Train a unigram HMM tagger and print its accuracy on the test set.

    A unigram model ignores sentence structure, so both corpora are
    flattened into individual tagged samples before training/evaluation.
    """
    # Flatten the corpora into (word, tag) samples.
    flat_train = sentences_to_samples(train_sentences)
    flat_test = sentences_to_samples(test_sentences)

    tagger = Unigram(flat_train)
    tagger.train()

    # Split the test samples by whether the word appeared in training data.
    known_samples, unknown_samples = divide_test_to_known_and_unknown_samples(
        train_sentences, test_sentences)

    # Report accuracy for each subset and for the whole test set.
    print("Accuracy rate for unknown words: ",
          tagger.get_accuracy_rate(np.array(unknown_samples)))
    print("Accuracy rate for known words: ",
          tagger.get_accuracy_rate(np.array(known_samples)))
    print("Total accuracy rate: ",
          tagger.get_accuracy_rate(np.array(flat_test)))
예제 #2
0
class Author(object):
    """An author's name together with the n-gram models built from their text.

    NOTE(review): the original class eagerly created shared Unigram()/Bigram()/
    Trigram() instances as class-level attributes; they were always shadowed by
    the per-instance attributes assigned in __init__, so they have been removed.
    """

    # Constructor.
    def __init__(self, name):
        """Create an author with the given name and fresh, empty n-gram models."""
        self.__name = name
        self.__unigram = Unigram()
        self.__bigram = Bigram()
        self.__trigram = Trigram()

    # Getters (kept as-is for backward compatibility with existing callers).
    def getUnigram(self):
        return self.__unigram

    def getBigram(self):
        return self.__bigram

    def getTrigram(self):
        return self.__trigram

    def getName(self):
        return self.__name

    # Caller method, it is used for counting frequency in the unigram, bigram and trigram.
    def counterCaller(self, separated_line):
        self.__unigram.counter(separated_line)
        self.__bigram.counter(separated_line)
        self.__trigram.counter(separated_line)

    # Caller method, it is used for generating new text with respect to unigram, bigram and trigram.
    def generatorCaller(self, uni_list, bi_list, tri_list):
        self.__unigram.generator(uni_list)
        self.__bigram.generator(bi_list)
        self.__trigram.generator(tri_list)
예제 #3
0
# Configure threading and hyper-parameters from the parsed CLI arguments.
print(os.cpu_count())
Config.num_threads = os.cpu_count()

Config.epsilon = args.epsilon
Config.learning_rate = args.lr
Config.lamb = args.lamb
Config.t = args.t

# Load the train / validation / test splits.
data = IOModule()
data_set = data.read_file(Config.train_data)
valid_set = data.read_file(Config.validate_data)
test_set = data.read_file(Config.test_data)

# Instantiate the requested model.
if args.model == 'unigram':
    model = Unigram(data_set, valid_set, test_set)
elif args.model == 'ngram':
    model = BiTrigram(data_set, valid_set, test_set)
elif args.model == 'custom':
    model = CustomModel(data_set, valid_set, test_set)
elif args.model == 'best':
    # "best" merges the feature sets of the n-gram and custom models.
    model1 = BiTrigram(data_set, valid_set, test_set)
    model2 = CustomModel(data_set, valid_set, test_set)
    model = Model(data_set, valid_set, test_set)
    model.combine_features_from_models(model1, model2)
else:
    # Fail fast with a clear message instead of a NameError on `model` below.
    raise ValueError("unknown model: {!r}".format(args.model))

model.generate_input_matrix()

model.gradient_ascent()

model.plot_output(args.model)
예제 #4
0
 def __init__(self, name):
     """Store the author's name and attach fresh unigram/bigram/trigram models."""
     self.__name = name
     # Constructors are still invoked left-to-right, as in the original.
     self.__unigram, self.__bigram, self.__trigram = (
         Unigram(), Bigram(), Trigram())
예제 #5
0
# Strip "<ERR targ=...> ... </ERR>" correction spans from the token list:
# drop everything from the "targ=..." token up to and including "</ERR>".
i = 0
delete = False  # NOTE(review): never read in the visible code — presumably leftover.
while i < len(origWords):
    if origWords[i].startswith("targ="):
        # Guard against a span with no closing tag (the old code would
        # run off the end of the list and raise IndexError).
        while i < len(origWords) and origWords[i] != "</ERR>":
            del origWords[i]
        if i < len(origWords):
            del origWords[i]  # remove the closing "</ERR>" token itself
        # Do not advance i: after the deletions origWords[i] is already the
        # next unexamined token (the old `i += 1` here skipped one token).
    else:
        i += 1




### Let's form our bigram.
unigram = Unigram(correctWords)
bigram = Bigram(correctWords, unigram)

# Frequency table over single characters and adjacent character pairs,
# seeded with the word-boundary marker counted once per word.
lettersMap = {EditDistance.WORDBOUNDARY: len(correctWords)}
for word in correctWords:
    # Count every individual character.
    for i in range(len(word)):
        key = word[i]
        lettersMap[key] = lettersMap.get(key, 0) + 1
    # Count the boundary-marker + first-letter pair for this word.
    start_key = EditDistance.WORDBOUNDARY + word[0]
    lettersMap[start_key] = lettersMap.get(start_key, 0) + 1
    # Count every adjacent character pair inside the word.
    for i in range(len(word) - 1):
        key = word[i] + word[i + 1]
        lettersMap[key] = lettersMap.get(key, 0) + 1

# This is for creating the edit distances. They return a hashmap with a tuple like this: ('ins', 'a', 'ab') -> 22
changeMap = {}
# Unique misspelled tokens, selected from origWords by their correction indexes.
# (Set comprehension instead of set([...]) — avoids building a throwaway list.)
wrongWordsSet = {origWords[i] for i in correctionIndexes}
start = time.time()