def test_CalcTriProbs(self):
        """Check calc_tri_prob() against hand-seeded uni-, bi- and trigram counts.

        Each expected row is [count, probability, log10(probability), trigram].
        For this fixture the probability equals the trigram count divided by
        the count of the trigram's leading bigram (e.g. 5 / 7 for
        'charlie xray alpha'), and the rows are listed by descending count.
        """
        tri_counts = {
            'alpha zulu bravo': 1,
            'bravo yankee charlie': 4,
            'charlie xray alpha': 5,
        }
        bi_counts = {'alpha zulu': 3, 'bravo yankee': 2, 'charlie xray': 7}
        uni_counts = {
            'alpha': 6,
            'zulu': 3,
            'bravo': 9,
            'yankee': 2,
            'charlie': 12,
            'xray': 7,
        }

        NGramsObj = ngrams.NGrams()
        NGramsObj.tri_dict = tri_counts
        NGramsObj.bi_dict = bi_counts
        NGramsObj.uni_dict = uni_counts

        # The seeded counts are arbitrary, which is why 'bravo yankee charlie'
        # comes out as 4 / 2 = 2.0: the test pins the arithmetic, not a valid
        # probability distribution.
        expected = [
            [5, 0.7142857142857143, -0.146128035678238, 'charlie xray alpha'],
            [4, 2.0, 0.3010299956639812, 'bravo yankee charlie'],
            [1, 0.3333333333333333, -0.47712125471966244, 'alpha zulu bravo'],
        ]

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.calc_tri_prob(), expected)
def main():
  input_file = sys.argv[1]
  input_t = open(input_file)
  
  NGramsObj = ngrams.NGrams()
  
  t = input_t.readline()
  while t:
    NGramsObj.read_into_dicts(t)
    t = input_t.readline()


  print "\data\\"  
  print "ngram 1: type=" + str(NGramsObj.count_types_tokens(NGramsObj.uni_dict)[0]) + " token=" + str(NGramsObj.count_types_tokens(NGramsObj.uni_dict)[1])
  print "ngram 2: type=" + str(NGramsObj.count_types_tokens(NGramsObj.bi_dict)[0]) + " token=" + str(NGramsObj.count_types_tokens(NGramsObj.bi_dict)[1])
  print "ngram 3: type=" + str(NGramsObj.count_types_tokens(NGramsObj.tri_dict)[0]) + " token=" + str(NGramsObj.count_types_tokens(NGramsObj.tri_dict)[1])
 
  print '\n', "\\1-grams:"
  for tup in NGramsObj.calc_uni_prob():  
    print tup[0], tup[1], tup[2], tup[3]     

  print '\n', "\\2-grams:"
  for tup in NGramsObj.calc_bi_prob():  
    print tup[0], tup[1], tup[2], tup[3]     

  print '\n', "\\3-grams:"
  for tup in NGramsObj.calc_tri_prob(): 
    print tup[0], tup[1], tup[2], tup[3]     



  input_t.close()
    def test_CalcUniProbs(self):
        """Check calc_uni_prob() against a three-entry unigram count table.

        Each expected row is [count, probability, log10(probability), unigram];
        here the probability is the count divided by the total of 12, and the
        rows are listed by descending count.
        """
        NGramsObj = ngrams.NGrams()
        NGramsObj.uni_dict = {'a': 3, 'b': 2, 'c': 7}

        expected = [
            [7, 0.5833333333333334, -0.23408320603336796, 'c'],
            [3, 0.25, -0.6020599913279624, 'a'],
            [2, 0.16666666666666666, -0.7781512503836436, 'b'],
        ]

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.calc_uni_prob(), expected)
    def test_unigrams(self):
        """Check that count_unigrams() yields one (token, count) pair per token."""
        testSent = "<s> John likes Mary </s>"

        NGramsObj = ngrams.NGrams()

        # NOTE(review): the expected order is neither the input order nor
        # alphabetical; presumably it mirrors the dict iteration order of the
        # interpreter this test was written on. Confirm before porting.
        expected = [
            ('<s>', 1),
            ('John', 1),
            ('</s>', 1),
            ('likes', 1),
            ('Mary', 1),
        ]

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.count_unigrams(testSent), expected)
    def test_readIntoDict(self):
        """Check that read_into_dicts() files "<count> <ngram>" lines by n-gram length.

        A one-, two- and three-word n-gram line must land in uni_dict, bi_dict
        and tri_dict respectively, each keyed by the n-gram text with the
        integer count as value.
        """
        NGramsObj = ngrams.NGrams()
        for count_line in ("100 but", "100 but if", "100 but what if"):
            NGramsObj.read_into_dicts(count_line)

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.uni_dict["but"], 100)
        self.assertEqual(NGramsObj.bi_dict["but if"], 100)
        self.assertEqual(NGramsObj.tri_dict["but what if"], 100)
Example #6
0
def main():
    lm_file = sys.argv[1]
    lm_input = open(lm_file)

    # create NGrams and read into dictionary
    NGramsObj = ngrams.NGrams()

    inputLine = lm_input.readline()
    inputLineIdx = 0
    while inputLine:

        if inputLineIdx < 6:
            inputLineIdx = inputLineIdx + 1
            inputLine = lm_input.readline()
            continue

        NGramsObj.read_lm_file_into_dicts(inputLine)
        inputLine = lm_input.readline()

    lm_input.close()

    l1 = float(sys.argv[2])
    l2 = float(sys.argv[3])
    l3 = float(sys.argv[4])

    training_file = sys.argv[5]
    training_input = open(training_file)

    trainingLine = training_input.readline()
    i = 1
    while trainingLine:
        perplexity = NGramsObj.Perplexity(buildLine(trainingLine), l1, l2, l3,
                                          i)

        #  print trainingLine + ' : ' + str(perplexity)

        trainingLine = training_input.readline()

        i += 1

    training_input.close()

    print '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%'

    print 'sent_num=' + str(NGramsObj.sentNum) + ' word_num=' + str(
        NGramsObj.wordNum) + ' oov_num=' + str(NGramsObj.oovNum)
    print 'lgprob=' + str(NGramsObj.logProbs) + ' ave_lgprob=' + str(
        NGramsObj.avg_lgprob()) + ' ppl=' + str(NGramsObj.ppl_calc())
    def test_trigrams(self):
        """Check count_trigrams() on two newline-separated, <s>/</s>-wrapped sentences.

        The expectation holds five trigrams per sentence, each with count 1;
        no expected trigram spans the line break between the two sentences.
        """
        testSent = ("<s> I love my cat . </s>\n"
                    "<s> my cat loves me . </s>")

        NGramsObj = ngrams.NGrams()

        # NOTE(review): the expected order is not the order of appearance in
        # the text; presumably it mirrors the dict iteration order of the
        # interpreter this test was written on. Confirm before porting.
        expected = [
            ('cat . </s>', 1),
            ('loves me .', 1),
            ('love my cat', 1),
            ('me . </s>', 1),
            ('my cat .', 1),
            ('my cat loves', 1),
            ('<s> my cat', 1),
            ('cat loves me', 1),
            ('<s> I love', 1),
            ('I love my', 1),
        ]

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.count_trigrams(testSent), expected)
    def test_readLMIntoDict(self):
        """Check read_lm_file_into_dicts() on data lines, a blank line and a header.

        Data lines have the form "<count> <prob> <logprob> <ngram>"; the test
        expects the n-gram to be stored in the dict matching its length, with
        the probability field kept as the original string.
        """
        unigram_line = "1000 0.0391374114516 -1.40740790197 <s>"
        blank_line = ""
        bigram_line = "1 0.2 -0.698970004336 executives drooled"
        section_header = "\\3-grams:"
        trigram_line = "1 0.333333333333 -0.47712125472 20 % and"

        NGramsObj = ngrams.NGrams()
        for lm_line in (unigram_line, blank_line, bigram_line, section_header,
                        trigram_line):
            NGramsObj.read_lm_file_into_dicts(lm_line)

        # assertEqual / assertIsNone report the offending value on failure,
        # unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.uni_dict["<s>"], "0.0391374114516")
        # NOTE(review): the two None checks call read_into_dicts(), not
        # read_lm_file_into_dicts(); possibly the LM reader was intended.
        # Left as written; confirm with the author.
        self.assertIsNone(NGramsObj.read_into_dicts(blank_line))
        self.assertEqual(NGramsObj.bi_dict["executives drooled"], "0.2")
        self.assertIsNone(NGramsObj.read_into_dicts(section_header))
        self.assertEqual(NGramsObj.tri_dict["20 % and"], "0.333333333333")
    def test_CalcBiProbs(self):
        """Check calc_bi_prob() against hand-seeded unigram and bigram counts.

        Each expected row is [count, probability, log10(probability), bigram].
        For this fixture the probability equals the bigram count divided by
        the count of the bigram's first word (e.g. 7 / 12 for 'charlie xray'),
        and the rows are listed by descending count.
        """
        bi_counts = {'alpha zulu': 3, 'bravo yankee': 2, 'charlie xray': 7}
        uni_counts = {
            'alpha': 6,
            'zulu': 3,
            'bravo': 9,
            'yankee': 2,
            'charlie': 12,
            'xray': 7,
        }

        NGramsObj = ngrams.NGrams()
        NGramsObj.bi_dict = bi_counts
        NGramsObj.uni_dict = uni_counts

        expected = [
            [7, 0.5833333333333334, -0.23408320603336796, 'charlie xray'],
            [3, 0.5, -0.3010299956639812, 'alpha zulu'],
            [2, 0.2222222222222222, -0.6532125137753437, 'bravo yankee'],
        ]

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.calc_bi_prob(), expected)
    def test_readLMIntoDict_1(self):
        """Load the example LM file and check Perplexity() on one known sentence.

        Reads "../examples/lm_ex" (relative to the working directory), skips
        its first five lines, feeds the rest to read_lm_file_into_dicts(), and
        expects Perplexity() with weights 0.2 / 0.3 / 0.5 to return the pinned
        two-element tuple.
        """
        teststr = "<s> Influential members of the House Ways and Means Committee introduced legislation that would restrict how the new savings-and-loan bailout agency can raise capital , creating another potential obstacle to the government 's sale of sick thrifts . </s>"
        l1 = 0.2
        l2 = 0.3
        l3 = 0.5

        NGramsObj = ngrams.NGrams()

        # The with-block guarantees the fixture file is closed, also on error.
        with open("../examples/lm_ex") as fileStream:
            for idx, lm_line in enumerate(fileStream, 1):
                # Lines 1-5 are skipped; parsing starts at line 6.
                if idx < 6:
                    continue
                NGramsObj.read_lm_file_into_dicts(lm_line)

        perp = NGramsObj.Perplexity(teststr, l1, l2, l3)

        # assertEqual prints the actual value on failure, so no debug print
        # is needed.
        self.assertEqual(perp, (-82.8860891791949, 37))
def main():
  input_file = sys.argv[1]
  input_t = open(input_file)
#  t = input_t.read()
  
  NGramsObj = ngrams.NGrams()
  text = ""
  t = input_t.readline()
  while t:
    text = text + NGramsObj.BOS_EOS(t)
    t = input_t.readline()

  for tup in NGramsObj.count_unigrams(text):  # unigrams
    print tup[1], '\t', tup[0]    # print count and 1-gram 

  for tup in NGramsObj.count_bigrams(text):  # bigrams
    print tup[1], '\t', tup[0]    # print count and 2-gram 

  for tup in NGramsObj.count_trigrams(text):  # trigrams
    print tup[1], '\t', tup[0]    # print count and 3-gram

  input_t.close()
    def test_BOS_EOS(self):
        """Check that BOS_EOS() wraps a sentence as "<s> ... </s>" plus a newline."""
        cases = [
            ("Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .",
             "<s> Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 . </s>"
             + '\n'),
            ("Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .",
             "<s> Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group . </s>"
             + '\n'),
            ("Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate .",
             "<s> Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate . </s>"
             + '\n'),
        ]

        NGramsObj = ngrams.NGrams()
        for sentence, expected in cases:
            # assertEqual reports both strings on failure, unlike
            # assertTrue(a == b).
            self.assertEqual(NGramsObj.BOS_EOS(sentence), expected)
    def test_countTypesTokens(self):
        """Check count_types_tokens() on a three-key count dict.

        The expected pair is (3, 12): the number of keys and the sum of the
        counts 3 + 2 + 7.
        """
        testdict = {'a': 3, 'b': 2, 'c': 7}

        NGramsObj = ngrams.NGrams()

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.count_types_tokens(testdict), (3, 12))
    def test_TrigramReadIntoDict(self):
        """Check that a "<count> <three-word ngram>" line is stored in tri_dict."""
        NGramsObj = ngrams.NGrams()
        NGramsObj.read_into_dicts("100 but what if")

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.tri_dict["but what if"], 100)
    def test_UnigramreadIntoDict(self):
        """Check that a "<count> <single-word ngram>" line is stored in uni_dict."""
        NGramsObj = ngrams.NGrams()
        NGramsObj.read_into_dicts("100 but")

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(NGramsObj.uni_dict["but"], 100)