Esempio n. 1
0
 def test_get_concordances_one_match_with_punctuation(self):
     """Check the single width-based concordance found for "number".

     The text is run through ``Text.prepare`` and tokenised with
     ``nltk.word_tokenize``; a ``ConcordanceWidth`` configured with
     ``width`` 26 is then expected to yield exactly one concordance whose
     citation contains punctuation.
     """
     text = u"Aaa bbbb, cc dddd number eeee fff. Aa bbb cccc ddd."
     text = Text.prepare(text)
     tokens = nltk.word_tokenize(text)
     params = {
         "width": 26,
         "regex_escape_chars": "?.*$+()[]",
         "regex_is_char": "[\\w'\\-]",
         "regex_start_right_concordance": "{0}\\s*{1}\\s*",
         "regex_start_right_one_token_concordance": "{0}\\s*{1}",
         "regex_end_left_concordance": "{0}\\s*{1}",
         "regex_end_right_concordance": "{0}",
         "regex_other_concordance": "{0}\\s*",
         # NOTE(review): key is spelled "partitons" -- confirm this is the
         # spelling the concordance classes actually read.
         "partitons": 1
     }
     conc = ConcordanceWidth(tokens=tokens, params=params)
     conc.load_text(text)
     # Define the citation once so its expected length cannot drift from it.
     citation = ", cc dddd number eeee fff."
     success = [
         {
             "citation": citation,
             "word": "number",
             "citation_length": len(citation)
         }
     ]
     offsets = conc.get_offsets("number")
     # Materialise the results first: iterating them directly would let the
     # test pass without checking anything if no concordance were produced.
     concordances = list(conc.get_concordances("number", offsets))
     self.assertEqual(len(success), len(concordances))
     for expected, concordance in zip(success, concordances):
         # assertEqual reports a readable diff, unlike assertTrue(a == b).
         self.assertEqual(expected, concordance)
 def test_get_concordances_sentence_three(self):
     """Check the sentence-based concordances found for "number".

     The text is run through ``Text.prepare`` and tokenised with
     ``nltk.word_tokenize``; a ``ConcordanceSentences`` configured with
     ``sentences`` 3 is then expected to yield the four citations listed
     below, in order.
     """
     text = "This is my sentence one about  my number one.\nNow end with number. Number at start too.  This is sentence four that does not have it.\n\tThis is my number,it does have it also."
     text = Text.prepare(text)
     tokens = nltk.word_tokenize(text)
     params = {
         "sentences": 3,
         "regex_escape_chars": "?.*$+()[]",
         "regex_is_char": "[\\w'\\-]",
         "regex_start_right_concordance": "{0}\\s*{1}\\s*",
         "regex_start_right_one_token_concordance": "{0}\\s*{1}",
         "regex_end_left_concordance": "{0}\\s*{1}",
         "regex_end_right_concordance": "{0}",
         "regex_other_concordance": "{0}\\s*",
         # NOTE(review): key is spelled "partitons" -- confirm this is the
         # spelling the concordance classes actually read.
         "partitons": 1
     }
     conc = ConcordanceSentences(tokens=tokens, params=params)
     conc.load_text(text)
     # Expected citations, one per expected concordance, in order.
     citations = [
         u"This is my sentence one about my number one. Now end with number.",
         u"This is my sentence one about my number one. Now end with number. Number at start too.",
         u"Now end with number. Number at start too. This is sentence four that does not have it.",
         u"This is sentence four that does not have it. This is my number, it does have it also.",
     ]
     # Derive each length from its own citation so the two cannot disagree.
     success = [
         {
             "citation": citation,
             "word": "number",
             "citation_length": len(citation)
         }
         for citation in citations
     ]
     offsets = conc.get_offsets("number")
     # Materialise the results first: iterating them directly would let the
     # test pass without checking every expected entry if too few
     # concordances were produced.
     concordances = list(conc.get_concordances("number", offsets))
     self.assertEqual(len(success), len(concordances))
     for expected, concordance in zip(success, concordances):
         # assertEqual reports a readable diff, unlike assertTrue(a == b).
         self.assertEqual(expected, concordance)
Esempio n. 3
0
 def test_prepare_punctuation_and_white_space(self):
     """``Text.prepare`` output must match the expected spacing after punctuation.

     The input omits the space after a full stop, an ellipsis and a comma;
     the expected string has a single space in each of those places.
     """
     sloppy_input = "I am a test case.My formatting is sloppy...I keep leaving out a space after a full stop,my email is [email protected]."
     expected = u"I am a test case. My formatting is sloppy... I keep leaving out a space after a full stop, my email is [email protected]."
     prepared = Text.prepare(sloppy_input)
     self.assertEqual(expected, prepared)
Esempio n. 4
0
 def test_prepare_white_space(self):
     """``Text.prepare`` output must match one single-spaced string for both inputs.

     One sample uses Windows line endings and the other Linux ones; both
     also contain tabs and runs of spaces.
     """
     expected = u"I am a test case. My carriage returns and white space shopuld be changed."
     windows_sample = "I am a test case.\r\nMy carriage returns  and\twhite \tspace shopuld be changed."
     linux_sample = "I am a test case.\nMy carriage returns\n\nand\twhite   space shopuld be changed."
     # Same order as before: the Windows sample is checked first.
     for sample in (windows_sample, linux_sample):
         self.assertEqual(expected, Text.prepare(sample))