def get_emission_matrix(training_file_name, max_identity_value):
    """Build a Laplace-smoothed emission probability matrix from a training file.

    The training file alternates lines: a correct tweet followed by its
    misspelled counterpart. Each (correct symbol, observed symbol) pair is
    counted, then every row is normalized into probabilities.

    Parameters:
        training_file_name: path to the training file (alternating
            correct/misspelled lines).
        max_identity_value: if non-zero, cap row i's cell at column i+1
            (presumably the "identity" emission — the column offset suggests
            `observable` has one extra leading symbol; TODO confirm) at this
            fraction of the row's total count before normalizing.

    Returns:
        A len(alphabet) x len(observable) nested list of floats; each row
        sums to ~1.0 unless its identity cell was capped.
    """
    # Laplace smoothing: start every count at 1 so no emission gets
    # probability zero.
    matrix = [[1] * len(observable) for _ in range(len(alphabet))]

    # BUG FIX: the original opened the file in binary mode ("rb") and then
    # called str.replace on the lines, which fails on bytes in Python 3.
    # Open in text mode and close the handle deterministically.
    with open(training_file_name, "r") as training_file:
        training = training_file.readlines()

    # Lines alternate: correct tweet at even indices, misspelled at odd.
    # (Distinct loop variables — the original reused `i` for both loops.)
    for line_idx in range(0, len(training), 2):
        correct_tweet = TweetChecker.parse(training[line_idx].replace("\n", ""))
        misspelled_tweet = parseObservation(training[line_idx + 1].replace("\n", ""))
        # Count emissions position by position (assumes both parsed tweets
        # have equal length — TODO confirm upstream guarantee).
        for pos in range(len(correct_tweet)):
            row = alphabet.index(correct_tweet[pos])
            col = observable.index(misspelled_tweet[pos])
            matrix[row][col] += 1

    for row in range(len(matrix)):
        den = sum(matrix[row])
        identity_col = row + 1  # diagonal shifted right by one column
        # Optionally cap the identity emission so some probability mass is
        # kept for genuine error emissions.
        if max_identity_value != 0 and matrix[row][identity_col] > den * max_identity_value:
            matrix[row][identity_col] = den * max_identity_value
        # Row-normalize using the pre-cap total (as in the original).
        matrix[row] = [float(count) / den for count in matrix[row]]
    return matrix
def createTrainingFile(tweetsFile, percentageOfError, file_name, without_underscore):
    """Write a training file of alternating correct/erroneous tweet lines.

    For each input tweet, the parsed (correct) line is written first, then a
    copy with artificial errors introduced at the given rate. When
    `without_underscore` is truthy, underscores are stripped from the
    erroneous line before writing.

    Returns:
        file_name, unchanged, for caller convenience.
    """
    with open(file_name, 'w+') as out:
        for raw_line in tweetsFile:
            # Parse once and strip the trailing newline once; the same clean
            # text feeds both the correct line and the error generator.
            clean = TweetChecker.parse(raw_line).replace("\n", "")
            out.write(clean + "\n")
            corrupted = introduceError(clean, percentageOfError)
            if without_underscore:
                corrupted = corrupted.replace("_", "")
            out.write(corrupted + "\n")
    return file_name