コード例 #1
0
ファイル: HMMTrainer.py プロジェクト: uazadi/HOMIT
def get_emission_matrix(training_file_name, max_identity_value):
    """Estimate the emission probability matrix from a training file.

    The training file is consumed as consecutive pairs of lines: the first
    line of each pair is passed to ``TweetChecker.parse`` (the correct
    tweet) and the second to ``parseObservation`` (the misspelled tweet).
    For every position of the correct tweet, the count at
    ``[alphabet index of the correct symbol][observable index of the
    misspelled symbol]`` is incremented, then each row is divided by its
    total.

    Args:
        training_file_name: Path of the training file to read.
        max_identity_value: Cap, expressed as a fraction of the row total,
            applied to the count stored at column ``row + 1`` of each row.
            A value of 0 disables the cap.

    Returns:
        A list of ``len(alphabet)`` rows, each a list of ``len(observable)``
        floats.

    Raises:
        IndexError: If the file holds an odd number of lines, or a
            misspelled tweet is shorter than its correct counterpart.
        ValueError: If a symbol is missing from ``alphabet`` or
            ``observable``.
    """
    # Every count starts at 1, so every normalised entry stays non-zero.
    matrix = [[1] * len(observable) for _ in range(len(alphabet))]

    # Text mode: the lines are handled as str below (str.replace with a str
    # argument fails on bytes under Python 3). The context manager makes
    # sure the file handle is closed once the lines are loaded.
    with open(training_file_name, "r") as training_file:
        training = training_file.readlines()

    for line_index in range(0, len(training), 2):
        correct_tweet = TweetChecker.parse(
            training[line_index].replace("\n", ""))
        misspelled_tweet = parseObservation(
            training[line_index + 1].replace("\n", ""))

        for position, symbol in enumerate(correct_tweet):
            row = alphabet.index(symbol)
            col = observable.index(misspelled_tweet[position])
            matrix[row][col] += 1

    for row, counts in enumerate(matrix):
        den = sum(counts)
        # NOTE(review): presumably observable[row + 1] is the observation
        # that matches alphabet[row] (i.e. observable carries one extra
        # leading symbol) -- confirm against the definition of observable.
        identity_col = row + 1
        if (max_identity_value != 0
                and counts[identity_col] > den * max_identity_value):
            counts[identity_col] = den * max_identity_value
        # The row is normalised by the total computed before the cap, so a
        # capped row sums to less than 1.
        matrix[row] = [float(count) / den for count in counts]

    return matrix
コード例 #2
0
ファイル: TrainingFileCreator.py プロジェクト: uazadi/HOMIT
def createTrainingFile(tweetsFile, percentageOfError, file_name, without_underscore):
    """Write a training file made of (parsed tweet, altered tweet) line pairs.

    Each item of ``tweetsFile`` is passed through ``TweetChecker.parse`` and
    stripped of newline characters. The result is written on one line,
    followed on the next line by the output of ``introduceError`` for that
    same text.

    Args:
        tweetsFile: Iterable yielding the tweets, one per item.
        percentageOfError: Forwarded unchanged to ``introduceError``.
        file_name: Path of the file to create (opened with mode ``'w+'``).
        without_underscore: When truthy, every ``"_"`` is removed from the
            altered line before it is written.

    Returns:
        ``file_name``, unchanged.
    """
    with open(file_name, 'w+') as output:
        for raw_tweet in tweetsFile:
            # Strip newlines once; both output lines derive from this text.
            parsed = TweetChecker.parse(raw_tweet).replace("\n", "")
            output.write(parsed + "\n")

            altered = introduceError(parsed, percentageOfError)
            if without_underscore:
                altered = altered.replace("_", "")
            output.write(altered + "\n")

    return file_name