def test_real_sentences(self):
    """Check the full (token, tag) sequence produced for a complete sentence."""
    wanted = [
        ('و', 'C'),
        ('من', 'N'),
        ('وقتاش', 'INTEROG'),
        ('رجعت', 'VBD'),
        ('تحكي', 'VBZ'),
        ('معا', 'N'),
        ('ه', 'PRO'),
        ('ال', 'DET'),
        ('مدير', 'N'),
        ('؟', 'PUNCT'),
    ]
    tokens = parse_string('ومن وقتاش رجعت تحكي معاه المدير؟')
    self.assertEqual(wanted, tokens)
def test_multiword_string(self):
    """Check parsing of a multi-word string that mixes Arabic and Latin script."""
    wanted = [
        ('هذه', 'DEM'),
        ('ال', 'DET'),
        ('عربية', 'N'),
        ('و', 'C'),
        ('هذه', 'DEM'),
        # Known, accepted mis-tag: this second 'ال' comes back as 'N'
        # rather than 'DET', because an isolated determiner is not an
        # expected word type.
        ('ال', 'N'),
        ('mixed', 'FW'),
        ('!', 'PUNCT'),
    ]
    tokens = parse_string('هذه العربية و هذه الmixed!')
    self.assertEqual(wanted, tokens)
def evaluate_parser_segmentation(filename='data/segmentation_gold.txt'):
    """
    Evaluates results of word segmentation.

    Each whitespace-separated token of the gold file is stripped of its '+'
    boundary markers, re-segmented with ``parse_string``, and the resulting
    segments are joined with '+' again so they can be compared with the gold
    token by ``calculate_segment_accuracy``.

    :param filename: A txt file (read as UTF-8) with arabic text with
                     morphologic boundaries marked with '+'
    :return: Three floats, for accuracy, precision and recall.
             Accuracy is word-level; precision and recall are character level.
    """
    gold_parse_list = []
    test_parse_list = []
    # The context manager guarantees the file handle is closed, even if
    # parsing raises; iterating the handle avoids loading every line at once.
    with open(filename, 'r', encoding='utf-8') as gold_file:
        for line in gold_file:
            for gold_token in line.split():
                gold_parse_list.append(gold_token)
                joined_token = gold_token.replace('+', '')
                # Only the segment text is compared; the tag is discarded.
                test_token = '+'.join(
                    w for w, _ in parse_string(joined_token))
                test_parse_list.append(test_token)
    accuracy, precision, recall = calculate_segment_accuracy(
        gold_parse_list, test_parse_list)
    return accuracy, precision, recall
def test_parse_timing(self):
    """Print the time one parse_string call takes, per timeit.default_timer.

    Makes no assertions; the measurement is only reported on stdout.
    """
    sentence = 'ومن وقتاش رجعت تحكي معاه المدير؟'
    began = timeit.default_timer()
    parse_string(sentence)
    elapsed = timeit.default_timer() - began
    print('Runtime: ', elapsed)
def test_conj_present_verb(self):
    """Check that a leading 'و' is split off as 'C' before a 'VBZ' verb."""
    tokens = parse_string('ونمشيو')
    wanted = [('و', 'C'), ('نمشيو', 'VBZ')]
    self.assertEqual(wanted, tokens)
def test_existing_parse(self):
    """Check that 'باش' comes back as a single token tagged 'PART'."""
    tokens = parse_string('باش')
    wanted = [('باش', 'PART')]
    self.assertEqual(wanted, tokens)
def test_conj_past_verb(self):
    """Check that a leading 'و' is split off as 'C' before a 'VBD' verb."""
    tokens = parse_string('وكتبت')
    wanted = [('و', 'C'), ('كتبت', 'VBD')]
    self.assertEqual(wanted, tokens)
def test_past_verb_defective(self):
    """Check that 'مشيت' is kept whole and tagged 'VBD'."""
    tokens = parse_string('مشيت')
    wanted = [('مشيت', 'VBD')]
    self.assertEqual(wanted, tokens)
def test_multiple_prefix(self):
    """Check that two stacked prefixes ('و' then 'ال') are both split off."""
    tokens = parse_string('والكتاب')
    wanted = [
        ('و', 'C'),
        ('ال', 'DET'),
        ('كتاب', 'N'),
    ]
    self.assertEqual(wanted, tokens)
def test_verb_with_shadda(self):
    """Check a verb written with a shadda.

    The expected token is tagged 'VBZ' and no longer carries the shadda
    that is present in the input.
    """
    tokens = parse_string('يرفّع')
    wanted = [('يرفع', 'VBZ')]
    self.assertEqual(wanted, tokens)
def test_particle_with_shadda(self):
    """Check a particle written with a shadda.

    The expected token is tagged 'REL' and no longer carries the shadda
    that is present in the input.
    """
    tokens = parse_string('الّي')
    wanted = [('الي', 'REL')]
    self.assertEqual(wanted, tokens)
def test_single_word(self):
    """Check that 'الكتاب' is split into a 'DET' prefix and an 'N' stem."""
    tokens = parse_string('الكتاب')
    wanted = [('ال', 'DET'), ('كتاب', 'N')]
    self.assertEqual(wanted, tokens)