def parse_string(string):
    """
    Morphologically segment and POS tag a string of Tunisian Arabic text.

    Every token returned by ``preprocess`` is handled by the first rule
    that matches:

    1. the token is a key of the saved parses: the saved value is appended
       unchanged;
    2. ``str.isalpha`` is false for the token: it is tagged ``'PUNCT'``;
    3. ``test_lang`` does not report ``'AR'``: it is tagged ``'FW'``;
    4. otherwise the token is run through ``stemmer`` and
       ``choose_best_parse``, and each returned segment is paired with the
       matching piece of the ``'_'``-separated POS string.

    :param string: Tunisian Arabic text
    :return: list of parse tuples in the format ('word', 'POS')
    """
    known_parses = load_saved_parses()
    result = []

    for token in preprocess(string):
        if token in known_parses:
            # Saved parses are appended as stored (not extended).
            result.append(known_parses[token])
        elif not token.isalpha():
            result.append((token, 'PUNCT'))
        elif test_lang(token) != 'AR':
            result.append((token, 'FW'))
        else:
            segments, tags = choose_best_parse(stemmer(token), debug=False)
            # 'UNINVBD' must be rewritten before the shorter 'UNIN',
            # otherwise it would be turned into 'NVBD'.
            tags = re.sub('UNINVBD', 'VBD', tags)
            # default to noun for uninflected unknown words
            tags = re.sub('UNIN', 'N', tags)
            result.extend(zip(segments, tags.split('_')))

    return result
def test_ave_vbz_freq(self):
    """The average frequency of the forms built from the 'VBZ' parse is not 0."""
    vbz_parse = stemmer('يكتبوا').get('VBZ')
    # Only the generated word forms are needed; the stem is ignored.
    _, alt_forms = make_alt_verb_forms(vbz_parse)
    self.assertNotEqual(compute_ave_freq(alt_forms), 0)
def test_ave_unin_vbd_freq(self):
    """The average frequency of the forms built from the 'UNIN' parse is not 0."""
    unin_parse = stemmer('كتب').get('UNIN')
    # Only the generated word forms are needed; the stem is ignored.
    _, alt_forms = make_alt_unin_verb_forms(unin_parse)
    self.assertNotEqual(compute_ave_freq(alt_forms), 0)
def test_ave_noun_freq(self):
    """The average frequency of the forms built from the 'DET_N' parse is not 0."""
    noun_parse = stemmer('الكتاب').get('DET_N')
    # Only the generated word forms are needed; the stem is ignored.
    _, alt_forms = make_alt_noun_forms(noun_parse)
    self.assertNotEqual(compute_ave_freq(alt_forms), 0)
def make_alt_verb_forms(self):
    """The average frequency of the forms built from the 'VBD' parse is not 0.

    NOTE(review): the name has no ``test`` prefix, so unittest's default
    loader would presumably not collect this method -- confirm whether that
    is intended. It also shares its name with the helper it calls.
    """
    vbd_parse = stemmer('كتبت').get('VBD')
    # Only the generated word forms are needed; the stem is ignored.
    _, alt_forms = make_alt_verb_forms(vbd_parse)
    self.assertNotEqual(compute_ave_freq(alt_forms), 0)
def test_make_unin_verb_forms(self):
    """The forms built from the 'UNIN' parse match the expected list, order ignored."""
    expected = [
        'يكتب', 'يكتبوا', 'يكتبو',
        'تكتب', 'تكتبوا', 'تكتبو',
        'نكتب', 'نكتبوا', 'نكتبو',
        'كتبت', 'كتبنا', 'كتبو', 'كتبوا',
    ]
    unin_parse = stemmer('كتب').get('UNIN')
    _, generated = make_alt_unin_verb_forms(unin_parse)
    # Both sides are sorted so only the contents are compared.
    self.assertEqual(sorted(generated), sorted(expected))
def test_make_alt_noun_forms(self):
    """The forms built from the 'DET_N' parse match the expected list, order ignored."""
    expected = [
        'كتابك', 'كتابكم', 'كتابنا',
        'كتابه', 'كتابها', 'كتابهم',
        'كتابو', 'كتابي', 'كتابيا',
        'لكتاب', 'كتاب',
    ]
    noun_parse = stemmer('الكتاب').get('DET_N')
    _, generated = make_alt_noun_forms(noun_parse)
    # Both sides are sorted so only the contents are compared.
    self.assertEqual(sorted(generated), sorted(expected))
def test_make_alt_vbd_forms(self):
    """The forms built from the 'VBD' parse match the expected list, order ignored."""
    expected = [
        'كتبنا', 'كتبو', 'كتبوا',
        'يكتب', 'يكتبوا', 'يكتبو',
        'تكتب', 'تكتبوا', 'تكتبو',
        'نكتب', 'نكتبوا', 'نكتبو',
        'كتب',
    ]
    vbd_parse = stemmer('كتبت').get('VBD')
    _, generated = make_alt_verb_forms(vbd_parse)
    # Both sides are sorted so only the contents are compared.
    self.assertEqual(sorted(generated), sorted(expected))
def choose_best_stem_test(word, debug=False):
    """Return the stem extracted from the parse chosen for ``word``.

    :param word: word passed to ``stemmer``
    :param debug: when true, print the parse dict; the flag is also
        forwarded to ``choose_best_parse``
    :return: result of ``extract_stem`` applied to the chosen parse
    """
    candidates = stemmer(word)
    if debug:
        print("Parse dict is", candidates)
    # The second element returned alongside the parse is not used here.
    chosen_parse, _ = choose_best_parse(candidates, debug=debug)
    return extract_stem(chosen_parse)
def test_verb_suffix_defined(self):
    """``extract_suffix`` on the 'VBD' parse of ``self.vbd`` returns 'ت'."""
    vbd_parse = stemmer(self.vbd).get('VBD')
    self.assertEqual('ت', extract_suffix(vbd_parse))
def test_returns_dict_with_correct_key(self):
    """``stemmer(self.vbd)`` has a non-None 'VBD' entry and only string keys."""
    parses = stemmer(self.vbd)
    vbd_entry = parses.get('VBD')
    self.assertIsNotNone(vbd_entry)
    for key in parses.keys():
        self.assertIsInstance(key, str)
def test_simple_stemmer(self):
    """For each (word, count) pair in ``self.test_words``, ``stemmer(word)`` has ``count`` entries."""
    for word, expected_count in self.test_words:
        parses = stemmer(word)
        actual_count = len(parses)
        self.assertEqual(expected_count, actual_count)