def test_simple(self):
    """Check ``ratio_to_passive_verbs.quantify`` on two short sentences.

    The first sentence is expected to yield a ratio of 0.0 and the
    second a ratio of 0.25.
    """
    no_passive = translationese.Analysis(
        """Tomorrow i will be there to answer your call""")
    with_passive = translationese.Analysis(
        """If It can be bought for less money, I will go and buy it""")

    # Both analyses are built before any quantification takes place.
    cases = (
        ({"ratio_to_passive_verbs": 0.0}, no_passive),
        ({"ratio_to_passive_verbs": 0.25}, with_passive),
    )
    for expected, analysis in cases:
        self.assertEqual(expected, ratio_to_passive_verbs.quantify(analysis))
def test_simple(self):
    """Check ``lexical_density.quantify`` against three hand-computed ratios."""
    contractions = translationese.Analysis(
        """I'm certain he said I am later than I am, let's go""")
    plain = translationese.Analysis("""we will go tomorrow to see a movie""")
    punctuated = translationese.Analysis("""What about punctuation?""")

    # (expected result, analysis) pairs, asserted in this order.
    cases = [
        ({"lexical_density": 7.0 / 15.0}, contractions),
        ({"lexical_density": 0.5}, plain),
        ({"lexical_density": 3.0 / 4.0}, punctuated),
    ]
    for expected, analysis in cases:
        self.assertEqual(expected, lexical_density.quantify(analysis))
def test_simple(self):
    """Check the three type/token ratios of ``LexicalVarietyQuantifier``.

    Each value is compared to a hand-written expression with
    ``assertAlmostEqual`` at one decimal place.
    """
    analysis = translationese.Analysis("""Hello hello world world.""")
    quantifier = lexical_variety.LexicalVarietyQuantifier(analysis)

    expected_ttr = 6 * (3 / 5.0)
    self.assertAlmostEqual(expected_ttr, quantifier.type_token_ratio(), 1)

    expected_log_ttr = 6 * math.log(3) / math.log(5.0)
    self.assertAlmostEqual(expected_log_ttr,
                           quantifier.log_type_token_ratio(), 1)

    expected_unique_ttr = 100 * math.log(5.0) / (1 - 1 / 3.0)
    self.assertAlmostEqual(expected_unique_ttr,
                           quantifier.unique_type_token_ratio(), 1)
def test_bigrams(self):
    """Check that ``Analysis.bigrams`` returns the expected bigram counts."""
    expected_counts = {
        ("which", "witch"): 2,
        ("witch", "should"): 1,
        ("should", "watch"): 1,
        ("watch", "which"): 1,
        ("witch", "watch"): 1,
        ("watch", "?"): 1,
    }
    analysis = translationese.Analysis(
        "Which witch should watch which witch watch?")
    self.assertDictEqual(analysis.bigrams(), expected_counts)
def analyze_file(filename, analyzer_module, variant=None, stanfordnlp=None,
                 lang=None):
    """Run ``analyzer_module`` over the analysis of ``filename``.

    A ``translationese.Analysis`` is created from ``filename``,
    ``stanfordnlp`` and ``lang`` and used as a context manager.

    Returns ``analyzer_module.quantify_variant(analysis, variant)`` when
    ``variant`` is not ``None``; otherwise
    ``analyzer_module.quantify(analysis)``.
    """
    with translationese.Analysis(filename=filename,
                                 stanfordnlp=stanfordnlp,
                                 lang=lang) as analysis:
        # No variant requested: use the module's default quantifier.
        if variant is None:
            return analyzer_module.quantify(analysis)
        return analyzer_module.quantify_variant(analysis, variant)
def test_quantifier_variants(self):
    """Check ``lexical_variety.quantify_variant`` for variants 0, 1 and 2.

    Each variant is expected to return a single-entry dict keyed
    "TTR1", "TTR2" and "TTR3" respectively.
    """
    analysis = translationese.Analysis("""Hello hello world world.""")

    # (variant index, expected result), checked in ascending variant order.
    cases = (
        (0, {"TTR1": 6 * (3 / 5.0)}),
        (1, {"TTR2": 6 * math.log(3) / math.log(5.0)}),
        (2, {"TTR3": 100 * math.log(5.0) / (1 - 1 / 3.0)}),
    )
    for variant, expected in cases:
        result = lexical_variety.quantify_variant(analysis, variant)
        self.assertDictEqual(expected, result)
def test_simple(self):
    """Check ``Analysis.histogram_normalized`` on a 14-token sentence.

    Each expected value is the token's count divided by 14.0, the
    denominator used for every entry below.
    """
    sentence = ("How much wood would a woodchuck chuck if a "
                "woodchuck could chuck wood?")
    a = translationese.Analysis(sentence)

    expected = {
        "how": 1 / 14.0,
        "much": 1 / 14.0,
        "wood": 2 / 14.0,
        "would": 1 / 14.0,
        "a": 2 / 14.0,
        "woodchuck": 2 / 14.0,
        "if": 1 / 14.0,
        "chuck": 2 / 14.0,
        "could": 1 / 14.0,
        "?": 1 / 14.0,
    }
    # assertEqual, not the deprecated alias assertEquals, which was
    # removed in Python 3.12 and raises AttributeError there.
    self.assertEqual(expected, a.histogram_normalized())
def test_pmi(self):
    """Check ``Analysis.pmi`` against hand-computed values for "a b a b b".

    The bigram counts and the normalized histogram are asserted first so
    the expected PMI figures can be followed step by step.
    """
    text = "a b a b b"
    analysis = translationese.Analysis(text)

    # White box test so this can be followed externally
    bigram_counts = {
        ("a", "b"): 2,  # freq = 0.5
        ("b", "a"): 1,  # freq = 0.25
        ("b", "b"): 1,  # freq = 0.25
    }
    self.assertDictEqual(analysis.bigrams(), bigram_counts)

    token_freqs = {
        "a": 2 / 5.0,
        "b": 3 / 5.0,
    }
    self.assertDictEqual(analysis.histogram_normalized(), token_freqs)

    # PMI written out as log(bigram freq / product of the token freqs).
    pmi_by_formula = {
        ("a", "b"): math.log(0.5 / (2 / 5.0 * 3 / 5.0)),
        ("b", "a"): math.log(0.25 / (3 / 5.0 * 2 / 5.0)),
        ("b", "b"): math.log(0.25 / (3 / 5.0 * 3 / 5.0)),
    }
    # The same values as rounded literals, to cross-check the formulas.
    pmi_rounded = {
        ("a", "b"): 0.73397,
        ("b", "a"): 0.04082,
        ("b", "b"): -0.36464,
    }

    mean_pmi = sum(pmi_rounded.values()) / 3.0
    self.assertAlmostEqual(0.13672, mean_pmi, 5)

    computed = analysis.pmi()
    for pair, formula_value in pmi_by_formula.items():
        self.assertAlmostEqual(formula_value, pmi_rounded[pair], 5)
        self.assertAlmostEqual(formula_value, computed[pair], 5)
def test_infinite_uniqueness(self):
    """Check that ``unique_type_token_ratio`` returns infinity for this text.

    The input is "Only different words." (presumably every token occurs
    exactly once, which is what drives the ratio to infinity; confirm
    against the quantifier's implementation).
    """
    analysis = translationese.Analysis("Only different words.")
    ratio = lexical_variety.LexicalVarietyQuantifier(
        analysis).unique_type_token_ratio()
    self.assertEqual(float("infinity"), ratio)