Example #1
    def test_add_one_word(self):
        """Assert that adding a single word to a newly constructed Vocabulary works as expected"""
        v = Vocabulary()
        i = v.get_int("hello")
        self.assertEqual(i, 0)
        self.assertEqual(v.size(), 1)
        self.assertEqual(v.words(), ["hello"])
        self.assertEqual(v.get_int("hello"), i)
        self.assertEqual(v.get_word(0), "hello")
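The Vocabulary class itself is not shown on this page. A minimal sketch consistent with the behavior the test above exercises (get_int assigns consecutive ints starting at 0 and reuses them for known words, get_word inverts the mapping, size and words report insertion order) could look like the following; this is an assumption about the interface, not the course's actual implementation:

class Vocabulary:
    def __init__(self):
        self._word_to_int = {}  # word -> int
        self._int_to_word = []  # int -> word, in insertion order

    def get_int(self, word):
        # Assign the next free int to an unseen word; reuse the existing int otherwise
        if word not in self._word_to_int:
            self._word_to_int[word] = len(self._int_to_word)
            self._int_to_word.append(word)
        return self._word_to_int[word]

    def get_word(self, i):
        return self._int_to_word[i]

    def size(self):
        return len(self._int_to_word)

    def words(self):
        return list(self._int_to_word)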
Example #2
    def add(self, e, f):
        # Tokenize each sentence, then map every token to its vocabulary int
        words_e = nltk.word_tokenize(e)
        words_f = nltk.word_tokenize(f)
        sent_e, sent_f = list(), list()
        for single_e in words_e:
            sent_e.append(self.e_vocab.get_int(single_e))
        for single_f in words_f:
            sent_f.append(self.f_vocab.get_int(single_f))
        self.e.append(sent_e)
        self.f.append(sent_f)
Example #3
    def test_adding_words(self):
        """Assert that words are properly added to the vocabulary"""
        v = Vocabulary()
        tokens = "Four score and seven years ago".split()
        ints = list()
        for token in tokens:
            ints.append(v.get_int(token))

        self.assertEqual(v.words(), tokens)

        # Every token should round-trip: word -> int -> word
        for token in tokens:
            i = v.get_int(token)
            self.assertIsNotNone(i)
            self.assertEqual(v.get_word(i), token)

        # Ints are assigned in insertion order, so index i maps back to tokens[i]
        for i in range(len(tokens)):
            self.assertEqual(v.get_word(i), tokens[i])
Example #4
    def add(self, e, f):
        # Tokenize the sentences
        english_words = nltk.word_tokenize(e)
        foreign_words = nltk.word_tokenize(f)

        # Create a list to hold the int representation of each sentence
        english_list = list()
        foreign_list = list()

        # Look up the int corresponding to each word and append it to the list
        for english_word in english_words:
            cur_int = self.e_vocab.get_int(english_word)
            english_list.append(cur_int)

        for foreign_word in foreign_words:
            cur_int = self.f_vocab.get_int(foreign_word)
            foreign_list.append(cur_int)

        # Append the lists of ints to self.e and self.f
        self.e.append(english_list)
        self.f.append(foreign_list)
Example #5
def create_vocab(words):
    v = Vocabulary()
    for word in words:
        v.get_int(word)
    return v
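With a Vocabulary like the sketch after Example #1, create_vocab deduplicates tokens while preserving first-seen order:

v = create_vocab("the quick brown fox the".split())
print(v.size())            # 4 -- the duplicate "the" maps to its existing int
print(v.get_int("quick"))  # 1
print(v.words())           # ['the', 'quick', 'brown', 'fox']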
Example #6
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()

    # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        # Convert the English sentence to a list of ints and append it to self.e
        wordlist = []
        for word in nltk.word_tokenize(e):
            wordlist.append(self.e_vocab.get_int(word))
        self.e.append(wordlist)

        # Convert the foreign sentence to a list of ints and append it to self.f
        wordlist = []
        for word in nltk.word_tokenize(f):
            wordlist.append(self.f_vocab.get_int(word))
        self.f.append(wordlist)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # Uniform initial probability: 1 / |foreign vocabulary|
        initial_prob = 1.0 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability
    #    of the English sentence (at that sentence index)
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        le = len(self.e[sentence_index])
        lf = len(self.f[sentence_index])
        # IBM Model 1: P(e|f) = epsilon / lf^le * prod_{j=1..le} sum_{i=1..lf} t(e_j | f_i),
        # i.e. a product over English positions of sums over foreign positions
        product = 1.0
        for j in range(le):
            tsum = 0.0
            for i in range(lf):
                tsum += conditional.get(j, i)
            product *= tsum
        return epsilon / math.pow(lf, le) * product

    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        # Corpus perplexity: the negated sum of log2 P(e|f) over all sentence pairs
        pp = 0.0
        for i in range(self.size()):
            p = self.conditional_probability(i, epsilon, conditional)
            pp += math.log(p, 2)
        return -pp
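
The Conditional class referenced above does not appear in these examples, so the following end-to-end sketch substitutes a hypothetical uniform stand-in (UniformConditional is an assumption that mimics only the constructor shape and the get method ParallelCorpus relies on; it is not the course's actual implementation):

# Hypothetical stand-in for the unshown Conditional class: every t(e_j | f_i)
# returns the same uniform value, regardless of position
class UniformConditional:
    def __init__(self, name, e_vocab, f_vocab, prob):
        self.name = name
        self.prob = prob

    def get(self, j, i):
        return self.prob

corpus = ParallelCorpus()
corpus.add("the house", "das Haus")  # requires nltk's "punkt" tokenizer data
corpus.add("the book", "das Buch")

t = UniformConditional("t", corpus.e_vocab, corpus.f_vocab,
                       1.0 / corpus.f_vocab.size())

# le = lf = 2 and t = 1/3, so P(e|f) = 1 / 2**2 * (1/3 + 1/3)**2 = 1/9
print(corpus.conditional_probability(0, 1.0, t))  # 0.1111...
print(corpus.perplexity(1.0, t))                  # -2 * log2(1/9) = 6.34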