from math import log


def calculate_posterior(sentence, label):
    # Accumulate log P(sentence, label) under the HMM. Summing logs is
    # equivalent to the log of the running product, but avoids the underflow
    # (and the resulting log(0) ValueError) a raw product hits on long sentences.
    log_posterior = 0.0
    for index in range(len(sentence)):
        # emission term: P(word | tag)
        log_posterior += log(Probabilities.get_posterior_word_probability(
            sentence[index], label[index]))

        if index == 0:
            # initial-tag term: P(t_0)
            log_posterior += log(Probabilities.get_first_speech_prob(label[index]))
        else:
            # transition term: P(t_i | t_{i-1})
            log_posterior += log(Probabilities.get_transition_prob(
                label[index], label[index - 1]))

    return log_posterior
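The function above computes the joint log probability log P(t_0) + sum_i log P(w_i | t_i) + sum_{i>0} log P(t_i | t_{i-1}) of a tagged sentence under the HMM. A minimal self-contained sketch of the same formula, with hypothetical toy tables standing in for the Probabilities class:

from math import log

# hypothetical toy tables (not from this repo), just to exercise the formula
emission = {("the", "det"): 0.9, ("dog", "noun"): 0.7}   # P(word | tag)
first_tag = {"det": 0.6, "noun": 0.4}                    # P(first tag)
transition = {("noun", "det"): 0.8}                      # P(tag | previous tag)

def toy_log_posterior(words, tags):
    # log P(t_0) + sum_i log P(w_i | t_i) + sum_{i>0} log P(t_i | t_{i-1})
    total = log(first_tag[tags[0]])
    for i, (w, t) in enumerate(zip(words, tags)):
        total += log(emission[(w, t)])
        if i > 0:
            total += log(transition[(tags[i], tags[i - 1])])
    return total

print(toy_log_posterior(["the", "dog"], ["det", "noun"]))  # log(0.6 * 0.9 * 0.8 * 0.7)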
Example #3
def Qc(train_set, test_set, laplace=False):
    """Handles the tasks of question c.

    Arguments:
        train_set -- tagged training sentences
        test_set -- tagged test sentences

    Keyword Arguments:
        laplace {bool} -- whether to use Laplace smoothing (default: {False})
    """
    gen_error_vec = []
    known_error_vec = []
    unknown_error_vec = []

    viterbi_results = []
    train_set = clean_POS(train_set)
    test_set = clean_POS(test_set)
    S = initialize_S(train_set)
    probs = Probabilities(S, train_set=train_set, test_set=test_set)
    for xy_tup in test_set:
        x = [t[0] for t in xy_tup]
        y = [t[1] for t in xy_tup]
        viterbi_tags = viterbi(x, probs, laplace)
        viterbi_results.append(viterbi_tags)
        err_vec, known_0, unknown_0 = calculate_error(viterbi_tags, y, x, probs)
        gen_error_vec.append(err_vec[0])
        # skip sentences that contribute no known/unknown words
        if not known_0:
            known_error_vec.append(err_vec[1])
        if not unknown_0:
            unknown_error_vec.append(err_vec[2])
    gen_error = statistics.mean(gen_error_vec)
    known_error = statistics.mean(known_error_vec)
    unknown_error = statistics.mean(unknown_error_vec)
    return [gen_error, known_error, unknown_error]
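One edge case worth guarding against: statistics.mean raises StatisticsError on an empty list, which could happen here if no sentence in the test set contributes a known or unknown word. A small hedged helper (safe_mean is not part of the original code):

import statistics

def safe_mean(values, default=0.0):
    # statistics.mean([]) raises StatisticsError, so fall back to a default
    return statistics.mean(values) if values else default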
def get_part_of_speech(sentence):
    pos = []
    for word in sentence:
        best_prob = 0
        best_speech = ""
        for speech in Probabilities.speech_prob.keys():
            # naive Bayes score: P(word | tag) * P(tag)
            word_prob = Probabilities.get_naive_word_probability(word, speech) * Probabilities.speech_prob[speech]

            # keep the highest-scoring tag
            if word_prob > best_prob:
                best_prob = word_prob
                best_speech = speech

        # fall back to a heuristic tag if the word was never seen in training
        if best_prob == 0:
            best_speech = Probabilities.get_best_possible_speech(word)

        pos.append(best_speech)

    # store result in result cache for future use (once, after the loop)
    result_cache.naive_result = pos

    return [[pos], []]
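The rule implemented above is the naive Bayes pick argmax_t P(word | t) * P(t). A self-contained toy version with hypothetical tables, to show the argmax in isolation:

word_given_tag = {("run", "verb"): 0.05, ("run", "noun"): 0.01}  # P(word | tag)
tag_prior = {"verb": 0.3, "noun": 0.4}                           # P(tag)

def naive_tag(word):
    # argmax over tags of P(word | tag) * P(tag); unseen pairs score 0
    return max(tag_prior, key=lambda t: word_given_tag.get((word, t), 0.0) * tag_prior[t])

print(naive_tag("run"))  # "verb", since 0.05 * 0.3 > 0.01 * 0.4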
Example #6
def Qe(train_set, test_set, laplace=False):
    """Handles the tasks of question e.

    Arguments:
        train_set -- tagged training sentences
        test_set -- tagged test sentences

    Keyword Arguments:
        laplace {bool} -- whether to use Laplace smoothing (default: {False})
    """
    # initializations
    viterbi_results = []
    gen_error_vec = []
    known_error_vec = []
    unknown_error_vec = []

    # "clean" the train and test sets of complex tags
    train_set = clean_POS(train_set)
    test_set = clean_POS(test_set)

    S = initialize_S(train_set)
    probs = Probabilities(S, train_set, test_set)
    # generate pseudo-word train/test sets and a matching probability object
    pseudo_train = probs.generate_pseudo_set(train_set)
    pseudo_test = probs.generate_pseudo_set(test_set)
    pseudo_probs = Probabilities(S, pseudo_train, pseudo_test)
    for xy_tup in pseudo_test:
        x = [t[0] for t in xy_tup]
        y = [t[1] for t in xy_tup]
        viterbi_tags = viterbi(x, pseudo_probs, laplace)
        viterbi_results.append(viterbi_tags)
        err_vec, _, _ = calculate_error(viterbi_tags, y, x, probs, True)
        gen_error_vec.append(err_vec[0])
        # update confusion values
        pseudo_probs.update_confusion_matrix(y, viterbi_tags)
    # print results and statistics
    gen_error = statistics.mean(gen_error_vec)
    print(gen_error)
    if laplace:
        print(DataFrame(confusion_matrix(S, pseudo_probs)))
    return gen_error
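generate_pseudo_set presumably maps rare and unseen words to coarse pseudo-word classes so the tagger can share statistics across them. The actual rules live inside Probabilities; the patterns below are a common illustrative scheme, not the repo's rules:

import re

def pseudo_word(word):
    # map a surface form to a coarse pseudo-word class (illustrative patterns only)
    if re.fullmatch(r"\d{4}", word):
        return "<FOUR_DIGITS>"
    if re.fullmatch(r"\d+", word):
        return "<NUMBER>"
    if word.isupper():
        return "<ALL_CAPS>"
    if word[:1].isupper():
        return "<INIT_CAP>"
    return "<OTHER>"

print(pseudo_word("1987"), pseudo_word("NASA"), pseudo_word("Paris"))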
def get_part_of_speech(sentence):
    # all the speech tags from the train data (listed so they can be indexed)
    speeches = list(Probabilities.speech_prob.keys())
    no_words = len(sentence)
    no_speech = len(speeches)

    # holds backtrack pointers for the trace back
    back_tracks = [[0] * (no_words + 1) for _ in range(no_speech)]

    # maximum probabilities for each speech tag at each position
    max_probabilities = [[0] * (no_words + 1) for _ in range(no_speech)]

    # initial probability calculation for the first word
    first_word = sentence[0]

    # base step
    for i in range(no_speech):
        max_probabilities[i][0] = Probabilities.get_first_speech_prob(
            speeches[i]) * Probabilities.get_word_probability(
                first_word, speeches[i])
        back_tracks[i][0] = ""

    # recursive step
    for word_index in range(1, no_words):
        curr_word = sentence[word_index]
        for tag_index in range(no_speech):
            arg_max = float("-inf")
            arg_bt = ""
            max_total_prob = float("-inf")
            curr_tag = speeches[tag_index]
            word_prob = Probabilities.get_word_probability(curr_word, curr_tag)
            for prev_tag_index in range(no_speech):
                prev_tag = speeches[prev_tag_index]

                # transition score: P(curr | prev) * best score ending in prev
                transition_prob = Probabilities.get_transition_prob(
                    curr_tag, prev_tag) * max_probabilities[prev_tag_index][word_index - 1]

                if transition_prob > arg_max:
                    arg_max = transition_prob
                    arg_bt = prev_tag

                # total probability including the emission term
                total_prob = transition_prob * word_prob

                if total_prob > max_total_prob:
                    max_total_prob = total_prob

            back_tracks[tag_index][word_index] = arg_bt
            max_probabilities[tag_index][word_index] = max_total_prob

    # terminal step: the extra column holds the best score/tag for the last word
    max_probabilities[no_speech - 1][no_words] = -1

    for i in range(no_speech):
        tag = speeches[i]
        last_prob = max_probabilities[i][
            no_words - 1] * Probabilities.get_last_speech_prob(tag)

        if max_probabilities[no_speech - 1][no_words] < last_prob:
            max_probabilities[no_speech - 1][no_words] = last_prob
            back_tracks[no_speech - 1][no_words] = tag

    # backtrack to recover the best path
    last_tag = back_tracks[no_speech - 1][no_words]

    solution = [last_tag]

    for word_index in range(no_words - 1, 0, -1):
        prev_tag_index = speeches.index(last_tag)
        last_tag = back_tracks[prev_tag_index][word_index]
        solution.append(last_tag)

    # the backtrack builds the path in reverse, so flip it
    solution.reverse()

    # store result in result cache for future use
    result_cache.viterbi_result = solution[:]

    return [[solution], []]
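Because the recursion above multiplies raw probabilities, long sentences can underflow to 0.0. A minimal log-space variant of the same dynamic program, on hypothetical toy tables rather than the Probabilities class:

from math import log

tags = ["det", "noun"]
start = {"det": 0.6, "noun": 0.4}                        # P(first tag)
trans = {("det", "noun"): 0.8, ("det", "det"): 0.2,      # P(next | prev)
         ("noun", "noun"): 0.3, ("noun", "det"): 0.7}
emit = {("det", "the"): 0.9, ("det", "dog"): 0.1,        # P(word | tag)
        ("noun", "the"): 0.05, ("noun", "dog"): 0.95}

def viterbi_log(words):
    # scores[t] = best log prob of any tag path ending in t at the current word
    scores = {t: log(start[t]) + log(emit[(t, words[0])]) for t in tags}
    back = []
    for w in words[1:]:
        prev = scores
        scores, pointers = {}, {}
        for t in tags:
            best_p = max(tags, key=lambda p: prev[p] + log(trans[(p, t)]))
            scores[t] = prev[best_p] + log(trans[(best_p, t)]) + log(emit[(t, w)])
            pointers[t] = best_p
        back.append(pointers)
    # trace back from the best final tag
    last = max(scores, key=scores.get)
    path = [last]
    for pointers in reversed(back):
        last = pointers[last]
        path.append(last)
    return list(reversed(path))

print(viterbi_log(["the", "dog"]))  # ['det', 'noun']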
def get_part_of_speech(sentence):
    result_cache.results = [result_cache.naive_result] + [result_cache.max_marginal] + [result_cache.viterbi_result]
    Probabilities.convert_algo_results(result_cache.results, sentence)
    pos = []
    previous_word = ""
    previous_speech = ""
    for index in range(len(sentence)):
        best_prob = float("-inf")
        best_speech = ""

        for speech in Probabilities.speech_prob.keys():
            word = sentence[index]
            prob = Probabilities.get_word_probability(word, speech)
            # check the single-word case first, so the index == 0 branch
            # never indexes past the end of a one-word sentence
            if len(sentence) == 1:
                prob *= Probabilities.get_first_speech_prob(speech)
            elif index == 0:
                # first word: initial-tag prior plus next-word evidence
                next_word = sentence[index + 1]
                prob *= Probabilities.get_next_word_speech_probability(
                    next_word, speech) * Probabilities.get_first_speech_prob(speech)
            elif index == len(sentence) - 1:
                # last word: transition from the chosen previous tag plus
                # previous-word evidence
                prob *= Probabilities.get_transition_prob(
                    speech, previous_speech) * Probabilities.get_prev_word_speech_probability(
                        previous_word, speech)
            else:
                # interior word: transition plus evidence from both neighbours
                next_word = sentence[index + 1]
                prob *= Probabilities.get_transition_prob(
                    speech, previous_speech) * Probabilities.get_prev_word_speech_probability(
                        previous_word, speech) * Probabilities.get_next_word_speech_probability(
                            next_word, speech)

            if best_prob < prob:
                best_prob = prob
                best_speech = speech

        # condition the next position on the tag actually chosen here,
        # not on whichever tag the inner loop happened to visit last
        previous_speech = best_speech
        previous_word = sentence[index]
        pos.append(best_speech)

    return pos
def get_samples(sentence, sample_count):
    ''' Gibbs sampler
    1. Generate an initial sample, i.e. assign the sentence random parts of speech [uniform or random or EM, not sure]
    1.1 Optional burn-in or thinning: throw away the first few samples. [TO-DO]
    Repeat sample_count times:
        2. Pick the last sample, x[t] <- x[t-1]
        3. Repeat the following steps for all unobserved/non-evidence words
        4. For each word picked, resample it from its posterior, keeping all other variables as evidence
    5. Add to the sample list
    '''
    sampleMap = Counter()
    samples = []
    # step 1 - generate the initial sample
    initSample = generateInitSamples(sentence)
    previousSample = initSample

    for iteration in range(1, Constants.gibbs_max_iteration):
        # step 2 - copy the last sample (a real copy, so mutating it below
        # does not silently alias previousSample)
        modifiedSample = previousSample[:]
        # step 3 - sweep over every word position
        for j in range(len(sentence)):
            speechTags = []
            probWeights = []
            sumProbWeights = 0
            # step 4 - posterior of this word's tag given its neighbours
            for speech in Probabilities.speech_prob.keys():
                word_prob = Probabilities.get_word_probability(sentence[j], speech)

                if len(sentence) == 1:
                    prob1 = Probabilities.get_first_speech_prob(speech)
                elif j == 0:  # first word, nothing prior
                    prob1 = Probabilities.get_first_speech_prob(speech) * Probabilities.get_transition_prob(
                        modifiedSample[j + 1], speech)
                elif j == len(sentence) - 1:
                    prob1 = Probabilities.get_transition_prob(speech, modifiedSample[j - 1])
                else:
                    prob1 = Probabilities.get_transition_prob(speech, modifiedSample[
                        j - 1]) * Probabilities.get_transition_prob(modifiedSample[j + 1], speech)
                prob = word_prob * prob1
                sumProbWeights += prob
                speechTags.append(speech)
                probWeights.append(prob)

            probWeights = [x / sumProbWeights for x in probWeights]

            # cumulative sum, then draw one tag from the discrete distribution
            cumsum = 0
            randomWeight = random()
            for k in range(len(probWeights)):
                cumsum += probWeights[k]
                probWeights[k] = cumsum

            randomIndex = -1
            for k in range(1, len(probWeights)):
                if probWeights[k - 1] <= randomWeight <= probWeights[k]:
                    randomIndex = k
                    break

            if randomIndex == -1:
                randomIndex = 0

            modifiedSample[j] = speechTags[randomIndex]

        previousSample = modifiedSample[:]
        samples.append(previousSample)
        sampleMap['_'.join(previousSample)] += 1

    samples = samples[Constants.gibbs_burn_in_count:]
    return samples, sampleMap
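The manual cumulative-sum draw in step 4 can also be done with the standard library's weighted sampling, which accepts unnormalized weights; a sketch with placeholder values:

from random import choices

speechTags = ["noun", "verb", "det"]     # candidate tags for one position
probWeights = [0.5, 0.3, 0.2]            # unnormalized posterior weights
sampled_tag = choices(speechTags, weights=probWeights, k=1)[0]
print(sampled_tag)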
Example #11
def train(self, data):
    # create all the probabilities required from the training data
    Probabilities.train_from_data(data)