Example #1
    def __init__(self):
        """Initializes the del_probs and ins_probs variables to empty MLE probability distributions,
        and the sub_probs to an empty conditional probability distribution."""
        self.del_probs = MLEProbDist(
            FreqDist()
        )  # an MLE probability distribution representing how likely each character is to be deleted
        self.ins_probs = MLEProbDist(
            FreqDist()
        )  # an MLE probability distribution representing how likely each character is to be inserted
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist(), MLEProbDist
        )  # a conditional probability distribution representing how likely a given character is to be replaced by another character
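Before training, all three distributions are empty, so every lookup comes back as probability 0 (NLTK's FreqDist.freq returns 0 when no samples have been counted). A minimal sketch of that untrained state:

from nltk import FreqDist, MLEProbDist

empty = MLEProbDist(FreqDist())
print(empty.prob("a"))  # 0 -- an untrained model treats every character as maximally unlikely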
Example #2
def recompute_cluster_dists(text, cluster_descr):
    # unigram distribution over clusters
    c_freqs = FreqDist()
    for c in text.clusters(cluster_descr):
        c_freqs[c] += 1  # FreqDist.inc() was removed in NLTK 3
    c_dist = MLEProbDist(c_freqs)

    # bigram distribution over adjacent cluster pairs
    c_bi_freqs = FreqDist()
    for bi_c in bigrams(text.clusters(cluster_descr)):
        c_bi_freqs[bi_c] += 1
    c_bi_dist = MLEProbDist(c_bi_freqs)

    return c_dist, c_bi_dist
Example #3
    def __init__(self, source, gen_func=lambda x: x):
        self.dictionary = Dictionary([gen_func(source)])
        self.gen_func = gen_func
        self.source = source

        self.word_freqs = FreqDist()
        for word in self.words():
            self.word_freqs[word] += 1  # FreqDist.inc() was removed in NLTK 3

        self.word_dist = MLEProbDist(self.word_freqs)
Example #4
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done by collecting frequencies of transitions
        between states, of symbol observations within each state, and of which
        states start a sentence. These frequency distributions are then
        normalised into probability estimates, which can be smoothed if
        desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting[state] += 1  # FreqDist.inc() was removed in NLTK 3
                else:
                    transitions[lasts][state] += 1
                outputs[state][symbol] += 1
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        # NLTK 3's ConditionalProbDist passes extra positional args straight
        # to the estimator, so the bin count is supplied directly
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Example #5
    def train_costs(self, alignments):
        """Given a list of character alignments, uses it to estimate the likelihood of different types of errors."""
        # find all of the deletions, insertions, and substitutions in the alignment list
        deletions = []
        insertions = []
        substitutions = []
        for alignment in alignments:
            fromChar = alignment[0]
            toChar = alignment[1]
            if ((fromChar == toChar) or (fromChar != '%' and toChar != '%')):
                substitutions.append(alignment)
            elif fromChar == '%':
                insertions.append(toChar)
            else:  # toChar == '%'
                deletions.append(fromChar)

        # use the results above to update the probability distributions in del_probs, ins_probs, and sub_probs
        self.del_probs = MLEProbDist(FreqDist(deletions))
        self.ins_probs = MLEProbDist(FreqDist(insertions))
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist(substitutions), MLEProbDist)
        return
Example #6
def build_language_models(corpus_words):
    unigram = FreqDist(corpus_words)
    unigram_prob = MLEProbDist(unigram)
    bigram = ConditionalFreqDist(nltk.bigrams(corpus_words))
    bigram_prob = ConditionalProbDist(bigram, MLEProbDist)

    def lm_1(words):
        # unigram model: product of unigram probabilities
        p = 1.0
        for w in words:
            p *= unigram_prob.prob(w)
        return p

    def lm_2(words):
        # bigram model: unigram probability for the first word,
        # conditional bigram probability for each following word
        p = 1.0
        previous_word = None
        for w in words:
            if previous_word is None:
                p *= unigram_prob.prob(w)
            else:
                p *= bigram_prob[previous_word].prob(w)
            previous_word = w
        return p

    return lm_1, lm_2
Example #7
import numpy as np
from nltk import FreqDist, MLEProbDist, ConditionalFreqDist, ConditionalProbDist


class EditDistanceFinder:
    def __init__(self):
        """Initializes the del_probs and ins_probs variables to empty MLE probability distributions,
        and the sub_probs to an empty conditional probability distribution."""
        self.del_probs = MLEProbDist(
            FreqDist()
        )  # an MLE probability distribution representing how likely each character is to be deleted
        self.ins_probs = MLEProbDist(
            FreqDist()
        )  # an MLE probability distribution representing how likely each character is to be inserted
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist(), MLEProbDist
        )  # a conditional probability distribution representing how likely a given character is to be replaced by another character

    def ins_cost(self, x):
        """Given a single character as input,
        returns a cost (between 0 and 1) of inserting that character."""
        ins_prob = self.ins_probs.prob(x)
        return float(1 - ins_prob)

    def del_cost(self, x):
        """Given a single character as input,
        returns a cost (between 0 and 1) of deleting that character."""
        del_prob = self.del_probs.prob(x)
        return float(1 - del_prob)

    def sub_cost(self, x, y):
        """Given two characters as input,
        returns a cost (between 0 and 1) of substituting the first character (x) with the second character (y)."""
        if x == y:
            return 0.0
        else:
            # sub_probs is conditioned on the original character x,
            # so the lookup order is sub_probs[x].prob(y)
            return 2.0 * (1.0 - float(self.sub_probs[x].prob(y)))

    def align(self, start, end):
        """Given two words, returns a distance (as a float) and the corresponding character alignments
        (as a list of tuples of characters)."""
        numRows = len(start) + 1
        numColumns = len(end) + 1
        dptable = np.array(([[0] * numColumns] * numRows), dtype=object)

        # each cell in the dp table will consist of (cost, char before modification, char after modification)
        # e.g. if the last action was to delete 'a' at a resulting cost of 10, the cell holds (10, 'a', '%')

        # base cases
        dptable[numRows - 1, 0] = (0.0, '%', '%')
        ## fill in the bottom row
        for i in range(1, numColumns):
            char = end[i - 1]
            cost = dptable[numRows - 1, i - 1][0] + self.ins_cost(char)
            dptable[numRows - 1, i] = (cost, '%', char)
        ## fill in the first column
        for j in range(numRows - 2, -1, -1):
            char = start[numRows - j - 2]
            cost = dptable[j + 1, 0][0] + self.del_cost(char)
            dptable[j, 0] = (cost, char, '%')

        # fill in the rest of the table
        newStart = "%" + start
        newEnd = "%" + end
        for row in range(numRows - 2, -1, -1):
            for col in range(1, numColumns):
                sub_cost = dptable[row + 1][col - 1][0] + self.sub_cost(
                    newStart[len(newStart) - row - 1], newEnd[col])
                del_cost = dptable[row + 1][col][0] + self.del_cost(
                    newStart[len(newStart) - row - 1])
                ins_cost = dptable[row][col - 1][0] + self.ins_cost(
                    newEnd[col])
                min_cost = min(sub_cost, del_cost, ins_cost)
                # find the move with the least cost and set fromChar and toChar accordingly
                if sub_cost == min_cost:
                    fromChar = newStart[len(newStart) - row - 1]
                    toChar = newEnd[col]
                elif del_cost == min_cost:
                    fromChar = newStart[len(newStart) - row - 1]
                    toChar = "%"
                elif ins_cost == min_cost:
                    fromChar = "%"
                    toChar = newEnd[col]
                dptable[row, col] = (min_cost, fromChar, toChar)

        # backtrace
        row = 0
        col = numColumns - 1
        path = []
        while (row != numRows - 1 or col != 0):
            fromChar = dptable[row][col][1]
            toChar = dptable[row][col][2]
            path.insert(0, (fromChar, toChar))
            # trace the last action and move to the prior cell
            ## if the prior move was to substitute
            if (fromChar == toChar) or (fromChar != '%' and toChar != '%'):
                row += 1
                col -= 1
            ## if the prior move was to insert
            elif (fromChar == '%'):
                col -= 1
            ## if the prior move was to delete
            else:
                row += 1

        return (dptable[0, numColumns - 1][0], path)

    def show_alignment(self, alignment):  # expects an alignment returned by align()
        """Takes the alignment returned by align and prints it in a friendly way."""
        string1 = [a[0] for a in alignment]
        string2 = [a[1] for a in alignment]
        print("String1:", ' '.join(string1))
        print("String2:", ' '.join(string2))
        return

    def train(self, file):
        """Given a file name, reads in the file and splits it into a list of tuples,
        e.g. [(misspelling1, correctspelling1), (misspelling2, correctspelling2), ...],
        then alternates calls to train_alignments and train_costs until the model converges."""
        with open(file) as f:
            pairs = [(parts[0], parts[1]) for parts in
                     (line.strip('\n').split(',') for line in f)]
        prior = None
        converged = False
        while not converged:
            print("Converging...")
            alignments = self.train_alignments(pairs)
            self.train_costs(alignments)
            # check for convergence
            if alignments == prior:
                converged = True
            prior = alignments
        return

    def train_alignments(self, misspellings):
        """Given a list of misspellings like the one returned by train, calls align on each of the (misspelling, correctspelling) pairs,
        and returns a single list with all of the character alignments from all of the pairs."""
        align_list = []
        for misspelled, correct in misspellings:
            align_list += self.align(misspelled, correct)[1]
        return align_list

    def train_costs(self, alignments):
        """Given a list of character alignments, uses it to estimate the likelihood of different types of errors."""
        # find all of the deletions, insertions, and substitutions in the alignment list
        deletions = []
        insertions = []
        substitutions = []
        for alignment in alignments:
            fromChar = alignment[0]
            toChar = alignment[1]
            if ((fromChar == toChar) or (fromChar != '%' and toChar != '%')):
                substitutions.append(alignment)
            elif fromChar == '%':
                insertions.append(toChar)
            else:  # toChar == '%'
                deletions.append(fromChar)

        # use the results above to update the probability distributions in del_probs, ins_probs, and sub_probs
        self.del_probs = MLEProbDist(FreqDist(deletions))
        self.ins_probs = MLEProbDist(FreqDist(insertions))
        self.sub_probs = ConditionalProbDist(
            ConditionalFreqDist(substitutions), MLEProbDist)
        return
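A hypothetical end-to-end run (misspellings.csv is a made-up file with one misspelling,correctspelling pair per line, the format train() expects):

finder = EditDistanceFinder()
finder.train("misspellings.csv")
distance, alignment = finder.align("caugt", "caught")
print(distance)
finder.show_alignment(alignment)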
Example #8
]

# drop the sentence-boundary markers 's' and '/s'
# (removing items while iterating over the same list skips elements,
#  so rebuild the list instead)
dataRaw_tokens_nopunct = [
    t for t in dataRaw_tokens_nopunct if t not in ('s', '/s')
]
dataRaw_fdist = FreqDist(dataRaw_tokens_nopunct)
##xx = dataRaw_fdist.most_common()
vocabRaw_tokens_nopunct = [
    word for word in word_tokenize(vocabRaw) if re.search(r"\w", word)
]

# calculate the probability distribution
dataRaw_pdist = MLEProbDist(dataRaw_fdist)
#yy = [(x, dataRaw_pdist.prob(x)) for x in dataRaw_pdist.samples()]
#yy =(aa, dataRaw_pdist.prob(aa))

# probability of each vocabulary word
wordPos = [(x, dataRaw_pdist.prob(x)) for x in vocabRaw_tokens_nopunct]

# probability of UNK: whatever mass is left after the known vocabulary
# (assumes each vocabulary word appears only once in vocabRaw_tokens_nopunct)
KPos = 0
for y in vocabRaw_tokens_nopunct:
    KPos += dataRaw_pdist.prob(y)
UNKPos = [('UNK', (1 - KPos))]

wordPos.append(UNKPos[0])
#print(wordPos)
#print('UNK, ',UNKPos)
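The UNK score above is just the leftover probability mass, P(UNK) = 1 - sum of P(w) over the vocabulary, which is why the loop assumes each vocabulary word appears only once. A toy check of that identity with made-up numbers:

vocab_probs = {"the": 0.5, "cat": 0.3}  # hypothetical known-word probabilities
p_unk = 1 - sum(vocab_probs.values())
print(p_unk)  # roughly 0.2 of the probability mass is left for unseen words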
Example #9
    def tag_prob(self):
        """Returns an MLE probability distribution over the collected tag frequencies."""
        return MLEProbDist(self.t_freq)
Example #10
            finalOutput.append("UNK")  # append UNK as value


def printOutContent(input):
    for a, b in input:  # prints out tuple contents
        print(a + ":" + str(b), end=" ")  # on the same line


theData = finalOutput

# UNIGRAM
fdist1 = FreqDist(theData)
fdist1["UNK"] = 0
# initialises the frequency distribution and registers UNK as a bin with count 0
# (FreqDist(theData) + FreqDist({"UNK": 0}) would silently drop UNK, because
# Counter addition discards non-positive counts)
# however unseen events still get an MLE probability of zero and don't get smoothed...
# Unsmoothed
unSmoothed = MLEProbDist(fdist1)  # initialises probability distribution
unSmoothProb = [(x, unSmoothed.prob(x)) for x in unSmoothed.samples()]

# Smoothed
Smoothed = LaplaceProbDist(fdist1)
SmoothedProb = [(x, Smoothed.prob(x)) for x in Smoothed.samples()]

# QUESTION 5
# BIGRAM
bigram = list(nltk.ngrams(theData, 2))
fdist2 = FreqDist(bigram)

# Unsmoothed
unSmoothedBigram = MLEProbDist(fdist2)
unSmoothedBigramProb = [(x, unSmoothedBigram.prob(x))
                        for x in unSmoothedBigram.samples()]
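As a quick sanity check (not part of the original script), the two unigram estimates differ exactly on the zero-count UNK bin:

print(unSmoothed.prob("UNK"))  # 0.0 under the MLE estimate
print(Smoothed.prob("UNK"))    # small but non-zero after add-one smoothing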
Example #11
from __future__ import print_function
from nltk.metrics import *

reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
test = 'DET VB VB DET NN NN NN IN DET NN'.split()
print(accuracy(reference, test))

reference_set = set(reference)
test_set = set(test)
print(precision(reference_set, test_set))
print(recall(reference_set, test_set))
print(f_measure(reference_set, test_set))

from nltk import FreqDist, MLEProbDist
pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
print(log_likelihood(['a', 'd'], [pdist1, pdist2]))

print(edit_distance("rain", "shine"))

s1 = set([1,2,3,4])
s2 = set([3,4,5])
print(binary_distance(s1, s2))
print(jaccard_distance(s1, s2))
print(masi_distance(s1, s2))

print(spearman_correlation({'e': 1, 't': 2, 'a': 3}, {'e': 1, 'a': 2, 't': 3}))

s1 = "000100000010"
s2 = "000010000100"
Example #12
def mle_of_tags(samples):
    set_samples = set(samples)  # deduplicate so each distinct sample appears once in the result
    mle_samples = MLEProbDist(FreqDist(samples))
    probs = {sample: mle_samples.prob(sample) for sample in set_samples}
    return probs
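A hypothetical call with a toy tag list:

print(mle_of_tags(["NN", "VB", "NN", "DET"]))
# e.g. {'NN': 0.5, 'VB': 0.25, 'DET': 0.25} (dict ordering may vary)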