예제 #1
0
 def test_real_sentences(self):
     """Parse a full question sentence and compare every (segment, tag) pair."""
     sentence = 'ومن وقتاش رجعت تحكي معاه المدير؟'
     parsed = parse_string(sentence)
     expected = [
         ('و', 'C'),
         ('من', 'N'),
         ('وقتاش', 'INTEROG'),
         ('رجعت', 'VBD'),
         ('تحكي', 'VBZ'),
         ('معا', 'N'),
         ('ه', 'PRO'),
         ('ال', 'DET'),
         ('مدير', 'N'),
         ('؟', 'PUNCT'),
     ]
     self.assertEqual(expected, parsed)
예제 #2
0
 def test_multiword_string(self):
     """Parse a multi-word string that mixes Arabic with a Latin-script word."""
     sentence = 'هذه العربية و هذه الmixed!'
     parsed = parse_string(sentence)
     # The second 'ال' comes back tagged 'N' rather than 'DET'. This is an
     # expected failure: an isolated determiner is not an expected word type.
     expected = [
         ('هذه', 'DEM'),
         ('ال', 'DET'),
         ('عربية', 'N'),
         ('و', 'C'),
         ('هذه', 'DEM'),
         ('ال', 'N'),
         ('mixed', 'FW'),
         ('!', 'PUNCT'),
     ]
     self.assertEqual(expected, parsed)
예제 #3
0
def evaluate_parser_segmentation(filename='data/segmentation_gold.txt'):
    """
    Evaluate the results of word segmentation against a gold-standard file.

    Every whitespace-separated token in the gold file is stripped of its '+'
    boundary markers, re-segmented with parse_string, and the resulting
    segments are re-joined with '+'. The gold tokens and the re-segmented
    tokens are then scored by calculate_segment_accuracy.

    :param filename: A txt file with arabic text with morphologic boundaries marked with '+'
    :return: Three floats, for accuracy, precision and recall. Accuracy is word-level; precision
    and recall are character level.
    """
    gold_parse_list = []
    test_parse_list = []
    # Use a context manager so the file handle is closed even if parsing
    # raises; iterating the handle streams lines instead of loading them all.
    with open(filename, 'r', encoding='utf-8') as gold_file:
        for line in gold_file:
            for gold_token in line.split():
                gold_parse_list.append(gold_token)
                # Remove the gold boundaries so the parser sees the raw word.
                joined_token = gold_token.replace('+', '')
                # Keep only the segment text; the tag is not needed here.
                test_token = '+'.join(w for w, _ in parse_string(joined_token))
                test_parse_list.append(test_token)
    accuracy, precision, recall = calculate_segment_accuracy(
        gold_parse_list, test_parse_list)
    return accuracy, precision, recall
예제 #4
0
 def test_parse_timing(self):
     """Print how long a single parse_string call takes on a sample sentence.

     Makes no assertions; the elapsed time from timeit.default_timer is
     only reported on stdout.
     """
     sentence = 'ومن وقتاش رجعت تحكي معاه المدير؟'
     began = timeit.default_timer()
     parse_string(sentence)
     elapsed = timeit.default_timer() - began
     print('Runtime: ', elapsed)
예제 #5
0
 def test_conj_present_verb(self):
     """A conjunction prefix is split off from a verb tagged 'VBZ'."""
     word = 'ونمشيو'
     parsed = parse_string(word)
     expected = [
         ('و', 'C'),
         ('نمشيو', 'VBZ'),
     ]
     self.assertEqual(expected, parsed)
예제 #6
0
 def test_existing_parse(self):
     """The word 'باش' is returned whole as a single 'PART' token."""
     word = 'باش'
     parsed = parse_string(word)
     expected = [
         ('باش', 'PART'),
     ]
     self.assertEqual(expected, parsed)
예제 #7
0
 def test_conj_past_verb(self):
     """A conjunction prefix is split off from a verb tagged 'VBD'."""
     word = 'وكتبت'
     parsed = parse_string(word)
     expected = [
         ('و', 'C'),
         ('كتبت', 'VBD'),
     ]
     self.assertEqual(expected, parsed)
예제 #8
0
 def test_past_verb_defective(self):
     """The verb 'مشيت' is kept as one token tagged 'VBD'."""
     word = 'مشيت'
     parsed = parse_string(word)
     expected = [
         ('مشيت', 'VBD'),
     ]
     self.assertEqual(expected, parsed)
예제 #9
0
 def test_multiple_prefix(self):
     """Two stacked prefixes ('و' then 'ال') are both split from the noun."""
     word = 'والكتاب'
     parsed = parse_string(word)
     expected = [
         ('و', 'C'),
         ('ال', 'DET'),
         ('كتاب', 'N'),
     ]
     self.assertEqual(expected, parsed)
예제 #10
0
 def test_verb_with_shadda(self):
     """A verb written with a shadda is expected back without the diacritic."""
     word = 'يرفّع'
     parsed = parse_string(word)
     expected = [
         ('يرفع', 'VBZ'),
     ]
     self.assertEqual(expected, parsed)
예제 #11
0
 def test_particle_with_shadda(self):
     """A particle written with a shadda is expected back without it, tagged 'REL'."""
     word = 'الّي'
     parsed = parse_string(word)
     expected = [
         ('الي', 'REL'),
     ]
     self.assertEqual(expected, parsed)
예제 #12
0
 def test_single_word(self):
     """A single word with the 'ال' prefix is split into 'DET' and 'N' tokens."""
     word = 'الكتاب'
     parsed = parse_string(word)
     expected = [
         ('ال', 'DET'),
         ('كتاب', 'N'),
     ]
     self.assertEqual(expected, parsed)