Example #1
class DocumentParser(object):
    def __init__(self, language):
        self._tokenizer = Tokenizer(language)
        self.language = language

    def tokenize_sentences(self, text):
        return self._tokenizer.to_sentences(text)

    def tokenize_words(self, sentence):
        return self._tokenizer.to_words(sentence)
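This parser simply delegates to the underlying tokenizer. A minimal usage sketch, assuming a sumy-style Tokenizer (from sumy.nlp.tokenizers) that exposes to_sentences and to_words; the sample text is made up:

from sumy.nlp.tokenizers import Tokenizer  # assumed import; the snippet above omits it

parser = DocumentParser("english")
for sentence in parser.tokenize_sentences("First sentence. And a second one."):
    print(parser.tokenize_words(sentence))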
Example #2
    def test_magnitude(self):
        tokenizer_en = Tokenizer("english")
        tokenizer_cn = Tokenizer("chinese")
        text_en = "i am very happy"
        text_cn = "我来到北京清华大学"
        model_en = TfDocumentModel(text_en, tokenizer_en)
        model_cn = TfDocumentModel(text_cn, tokenizer_cn)

        self.assertAlmostEqual(model_en.magnitude, 2.0)
        self.assertAlmostEqual(model_cn.magnitude, 2.0)
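Both assertions check out if magnitude is the Euclidean norm of the term-frequency vector, as in sumy's TfDocumentModel: each text tokenizes into four distinct terms (the Chinese sentence segments into 我 / 来到 / 北京 / 清华大学, per Example #5), each with frequency 1, so the magnitude is sqrt(1^2 + 1^2 + 1^2 + 1^2) = 2.0.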
Example #3
def build_document_from_string(string="", language="english"):
    paragraphs = []
    tokenizer = Tokenizer(language)

    for line in string.strip().splitlines():
        line = line.lstrip()
        if not line:
            continue  # skip blank lines
        # each non-empty line becomes its own paragraph
        sentences = [build_sentence(sentence, language)
                     for sentence in tokenizer.to_sentences(line)]
        paragraphs.append(Paragraph(sentences))

    return Document(paragraphs)
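A usage sketch for the helper above (the input string is made up); each non-blank line of the string yields one Paragraph in the resulting Document:

document = build_document_from_string(
    "First sentence of paragraph one. A second sentence.\n"
    "Paragraph two is a single line.",
    language="english",
)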
Example #4
    def test_terms(self):
        tokenizer = Tokenizer("english")
        text = "wA wB wC wD wB wD wE"
        model = TfDocumentModel(text, tokenizer)

        terms = tuple(sorted(model.terms))
        self.assertEqual(terms, ("wa", "wb", "wc", "wd", "we"))
Example #5
    def test_ensure_tokenizer_available(self):
        tokenizer_en = Tokenizer("english")
        self.assertEqual("english", tokenizer_en.language)
        tokenizer_cn = Tokenizer("chinese")
        self.assertEqual("chinese", tokenizer_cn.language)

        sentence_en = "You are very beautiful."
        sentence_cn = "我来到北京清华大学"

        expected_en_words = (
            "You",
            "are",
            "very",
            "beautiful",
        )

        expected_cn_words = (
            "我",
            "来到",
            "北京",
            "清华大学",
        )

        self.assertEqual(expected_en_words, tokenizer_en.to_words(sentence_en))
        self.assertEqual(expected_cn_words, tokenizer_cn.to_words(sentence_cn))
Example #6
    def test_most_frequent_terms(self):
        tokenizer = Tokenizer("english")
        text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
        model = TfDocumentModel(text, tokenizer)

        self.assertEqual(model.most_frequent_terms(1), ("we", ))
        self.assertEqual(model.most_frequent_terms(2), ("we", "wd"))
        self.assertEqual(model.most_frequent_terms(3), ("we", "wd", "wc"))
        self.assertEqual(model.most_frequent_terms(4),
                         ("we", "wd", "wc", "wb"))
        self.assertEqual(model.most_frequent_terms(5),
                         ("we", "wd", "wc", "wb", "wa"))
        self.assertEqual(model.most_frequent_terms(),
                         ("we", "wd", "wc", "wb", "wa"))
Example #7
    def test_ensure_segment_available(self):
        tokenizer_en = Tokenizer("english")
        tokenizer_cn = Tokenizer("chinese")

        sentence_en = "There are two sentences here. Am I right?"
        sentence_cn = "这里貌似有两句话。不知道我说的对不对。"

        self.assertEqual(len(tokenizer_en.to_sentences(sentence_en)), 2)
        self.assertEqual(len(tokenizer_cn.to_sentences(sentence_cn)), 2)
Example #8
    def test_term_frequency(self):
        tokenizer = Tokenizer("english")
        text = "wA wB wC wA wA wC wD wCwB"
        model = TfDocumentModel(text, tokenizer)

        self.assertEqual(model.term_frequency("wa"), 3)
        self.assertEqual(model.term_frequency("wb"), 1)
        self.assertEqual(model.term_frequency("wc"), 2)
        self.assertEqual(model.term_frequency("wd"), 1)
        self.assertEqual(model.term_frequency("wcwb"), 1)
        self.assertEqual(model.term_frequency("we"), 0)
        self.assertEqual(model.term_frequency("missing"), 0)
        self.assertAlmostEqual(model.term_frequency_normalized("missing"), 0)
        self.assertAlmostEqual(model.term_frequency_normalized("wa", "max"),
                               1.0)
        self.assertAlmostEqual(model.term_frequency_normalized("wa"), 3 / 8)
Example #9
    def test_sentence_equal(self):
        sentence_one = Sentence("", Tokenizer("english"))
        sentence_two = Sentence("", Tokenizer("english"))
        self.assertEqual(sentence_one, sentence_two)

        sentence_one = Sentence("another example", Tokenizer("english"))
        sentence_two = Sentence("another example", Tokenizer("english"))
        self.assertEqual(sentence_one, sentence_two)

        sentence_one = Sentence("example", Tokenizer("english"))
        sentence_two = Sentence("another", Tokenizer("english"))
        self.assertNotEqual(sentence_one, sentence_two)
Example #10
    def __init__(self, language):
        self._tokenizer = Tokenizer(language)
        self.language = language
Example #11
def build_sentence(sentence_as_string="", language="english"):
    tokenizer = Tokenizer(language)
    return Sentence(sentence_as_string, tokenizer)
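This is the sentence-level counterpart of Example #3's build_document_from_string, which calls it once per segmented sentence. A trivial usage sketch with a made-up string:

sentence = build_sentence("This is a made-up example sentence.", "english")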
Example #12
    def test_most_frequent_terms_negative_count(self):
        tokenizer = Tokenizer("english")
        model = TfDocumentModel("text", tokenizer)

        self.assertRaises(ValueError, model.most_frequent_terms, -1)
Example #13
    def test_most_frequent_terms_empty(self):
        tokenizer = Tokenizer("english")
        model = TfDocumentModel("", tokenizer)

        self.assertEqual(model.most_frequent_terms(), ())
        self.assertEqual(model.most_frequent_terms(10), ())