def show_collocations(text):
    """
    Show collocations found in the text.
    The collocations are looked up in a database extracted from a corpus.
    Every collocation is wrapped in its own ``<mark class='coll'>`` element,
    and each word (marked or not) is followed by a single space.

    @param text: a given vocalized text.
    @type text: unicode.
    @return: the text with collocations quoted by HTML mark tags.
    @rtype: unicode
    """
    import maskouk.collocations as colloc

    coll = colloc.CollocationClass(True)
    text = coll.lookup4long_collocations(text)
    wordlist = araby.tokenize(text)
    vocalized_list, taglist = coll.lookup(wordlist)

    pieces = []
    opened = False
    for word, tag in zip(vocalized_list, taglist):
        if tag in ("CB", "CI"):
            # "CB" begins a collocation: if one is still open, close it first
            # so that two adjacent collocations are not merged in one mark.
            if opened and tag == "CB":
                pieces.append("</mark>")
                opened = False
            if not opened:
                pieces.append("<mark class='coll'>")
                opened = True
        elif opened:
            pieces.append("</mark>")
            opened = False
        pieces.append(word + " ")
    # The text may end inside a collocation: never leave the mark unclosed.
    if opened:
        pieces.append("</mark>")
    return u"".join(pieces)
def test_lookup4long(self):
    """Long collocations inside a phrase are returned vocalized."""
    finder = msk.CollocationClass()
    phrase = u' قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت'
    expected = u' قلت لهم السّلامُ عَلَيكُمْ وَرَحْمَةُ اللهِ تَعَالَى وبركاته ثم رجعت'
    actual = finder.lookup4long_collocations(phrase)
    self.assertEqual(actual, expected)
def test_ngramfinder(self):
    """Two-word collocations of a tokenized phrase are joined in one item."""
    finder = msk.CollocationClass()
    tokens = araby.tokenize(u"لعبنا مباراة كرة القدم في بيت المقدس")
    expected = ['لعبنا', 'مباراة', 'كرة القدم', 'في', 'بيت المقدس']
    actual = finder.ngramfinder(2, tokens)
    self.assertEqual(actual, expected)
def test_is_possible_collocation(self):
    """
    Check the value returned by is_possible_collocation for word pairs.

    The pairs are the consecutive words of the phrase:
    ظهر رئيس الوزراء السيد عبد الملك بن عامر ومعه أمير دولة غرناطة ونهر النيل انطلاق السباق
    (the first pair joins the last word of the phrase with the first one).
    """
    mydict = msk.CollocationClass()
    # each case is [word pair, expected value]
    cases = [
        [['السباق', 'ظهر'], 100],
        [['ظهر', 'رئيس'], 100],
        [['رئيس', 'الوزراء'], 100],
        [['الوزراء', 'السيد'], 20],
        [['السيد', 'عبد'], 100],
        [['عبد', 'الملك'], 15],
        [['الملك', 'بن'], 100],
        [['بن', 'عامر'], 15],
        [['عامر', 'ومعه'], 100],
        [['ومعه', 'أمير'], 100],
        [['أمير', 'دولة'], 100],
        [['دولة', 'غرناطة'], 10],
        [['غرناطة', 'ونهر'], 100],
        [['ونهر', 'النيل'], 100],
        [['النيل', 'انطلاق'], 100],
        [['انطلاق', 'السباق'], 100],
    ]
    # NOTE(review): the keyword is spelled `length` here, while the demo
    # function test() passes `lenght` -- confirm which spelling
    # CollocationClass.is_possible_collocation really accepts.
    for wlist, expected in cases:
        # the message tells which pair failed, as all cases share one test
        self.assertEqual(
            mydict.is_possible_collocation(wlist, length=2),
            expected,
            "unexpected value for the pair %r" % (wlist,),
        )
def __init__(self):
    """
    Set the default vocalization options and create the analyzers.
    """
    # maximum number of words to vocalize
    self.limit = 1000

    # ---- options; most of them exist to be switched while debugging ----
    # put the last mark on the vocalized words in the output
    self.enabled_last_mark = True
    # statistical vocalization based on collocations
    self.enabled_stat_tashkeel = True
    # show the collocation marks (off by default)
    self.enabled_show_collocation_mark = False
    # choose the vocalization by score
    self.select_by_score_enabled = False
    # syntactic analysis
    self.enabled_syntaxic_analysis = True
    # adjust the vocalization result for التقاء الساكنين
    self.enabled_ajust_vocalization = True
    # semantic analysis
    self.enabled_semantic_analysis = True
    # allow the last mark (Harakat Al-I3rab)
    self.allow_syntax_last_mark = True

    # ---- analyzers ----
    # lexical analyzer, used without its cache
    lexical = qalsadi.analex.Analex()
    self.analyzer = lexical
    lexical.disable_allow_cache_use()
    # syntactic analyzer
    self.anasynt = aranasyn.anasyn.SyntaxAnalyzer()
    # semantic analyzer
    self.anasem = asmai.anasem.SemanticAnalyzer()
    # `debug` is not defined in this method; presumably a module-level
    # flag -- TODO confirm
    lexical.set_debug(debug)
    lexical.set_limit(self.limit)

    # collocations dictionary for statistical tashkeel
    show_marks = self.enabled_show_collocation_mark
    self.collo = coll.CollocationClass(show_marks)
    # vocalizer for the unrecognized words
    self.unknown_vocalizer = unknown_tashkeel.UnknownTashkeel()
def test_lookup(self):
    """lookup returns the vocalized words together with their tags."""
    finder = msk.CollocationClass()
    tokens = araby.tokenize(u"لعبنا مباراة كرة القدم في بيت المقدس")
    expected_words = [
        'لعبنا', 'مباراة', 'كُرَة', 'الْقَدَمِ', 'في', 'بَيْت', 'الْمَقْدِسِ'
    ]
    expected_tags = ['CO', 'CO', 'CB', 'CI', 'CO', 'CB', 'CI']
    actual = finder.lookup(tokens)
    self.assertEqual(actual, (expected_words, expected_tags))
def test_is_collocated(self):
    """A stored pair gives its collocation, an unknown pair gives False."""
    finder = msk.CollocationClass()

    stored_pair = ['كرة', 'القدم']
    self.assertEqual(finder.is_collocated(stored_pair), u"كرة القدم")

    unknown_pair = ['شمس', 'النهار']
    self.assertEqual(finder.is_collocated(unknown_pair), False)
def extract_enteties(text):
    """
    Quote the entities of a text: numbers, named entities and collocations.
    Each entity is wrapped in an HTML mark element whose class is
    'number', 'named' or 'coll'; the items are joined by single spaces.

    @param text: a given text.
    @type text: unicode.
    @return: the text with the entity phrases quoted.
    @rtype: unicode
    """
    import pyarabic.number
    import pyarabic.named
    import maskouk.collocations as colloc

    finder = colloc.CollocationClass(True)
    tokens = araby.tokenize(text)
    number_tags = pyarabic.number.detect_numbers(tokens)
    number_vocs = pyarabic.number.pre_tashkeel_number(tokens)
    named_tags = pyarabic.named.detect_named(tokens)
    named_vocs = pyarabic.named.pretashkeel_named(tokens)
    coll_vocs, coll_tags = finder.lookup(tokens)

    pieces = []
    in_mark = False
    rows = zip(tokens, number_tags, number_vocs, named_tags, named_vocs,
               coll_tags, coll_vocs)
    for token, nb_tag, nb_voc, nm_tag, nm_voc, co_tag, co_voc in rows:
        # A "begin" tag starts a new mark; numbers win over named
        # entities, which win over collocations.
        if nb_tag == 'DB':
            start = ("<mark class='number'>", nb_voc)
        elif nm_tag == 'NB':
            start = ("<mark class='named'>", nm_voc)
        elif co_tag == 'CB':
            start = ("<mark class='coll'>", co_voc)
        else:
            start = None

        if start is not None:
            if in_mark:
                pieces.append("</mark>")
            pieces.extend(start)
            in_mark = True
            continue

        # An "inside" tag just adds the vocalized word to the output.
        if nm_tag == "NI":
            pieces.append(nm_voc)
        elif nb_tag == "DI":
            pieces.append(nb_voc)
        elif co_tag == "CI":
            pieces.append(co_voc)
        else:
            # an ordinary word ends the mark, if there is one
            if in_mark:
                pieces.append("</mark>")
                in_mark = False
            pieces.append(token)

    if in_mark:
        pieces.append("</mark>")
    return u" ".join(pieces)
def extract_enteties2(text):
    """
    Extract entities as numbers, named entities, collocations.
    Each entity is wrapped in an HTML mark element whose class is
    'number', 'named' or 'coll', and every word is followed by a space.
    Numbers and named entities keep the original word; collocations use
    the vocalized word.

    @param text: a given text.
    @type text: unicode.
    @return: the text with the entity phrases quoted.
    @rtype: unicode
    """
    import pyarabic.number
    import pyarabic.named
    import maskouk.collocations as colloc

    coll = colloc.CollocationClass(True)
    wordlist = araby.tokenize(text)
    taglist_nb = pyarabic.number.detect_numbers(wordlist)
    taglist_nmd = pyarabic.named.detect_named(wordlist)
    vocalized_list, taglist_coll = coll.lookup(wordlist)

    pieces = []
    # class of the mark currently open, None when no mark is open
    open_class = None
    for word, voc, tagnb, tagnmd, tagcol in zip(
            wordlist, vocalized_list, taglist_nb, taglist_nmd, taglist_coll):
        # classify the word; numbers win over named entities, which win
        # over collocations
        if tagnb in ('DI', 'DB'):
            css_class, begins, shown = 'number', tagnb == 'DB', word
        elif tagnmd in ('NI', 'NB'):
            css_class, begins, shown = 'named', tagnmd == 'NB', word
        elif tagcol in ('CI', 'CB'):
            css_class, begins, shown = 'coll', tagcol == 'CB', voc
        else:
            css_class, begins, shown = None, False, word

        # Close the open mark when the word starts a new entity or does
        # not belong to the class of the open mark; otherwise the mark
        # would wrap unrelated words.
        if open_class is not None and (begins or css_class != open_class):
            pieces.append("</mark>")
            open_class = None
        if css_class is not None and open_class is None:
            pieces.append("<mark class='%s'>" % css_class)
            open_class = css_class
        pieces.append(shown + " ")

    # The text may end inside an entity: never leave the mark unclosed.
    if open_class is not None:
        pieces.append("</mark>")
    return u"".join(pieces)
def test_is_collocated_word(self):
    """A word is mapped to its known second words and their collocations."""
    finder = msk.CollocationClass()

    # word with one collocation
    expected_kora = {'القدم': 'كُرَة الْقَدَمِ'}
    self.assertEqual(finder.is_collocated_word(u"كرة"), expected_kora)

    # word with many collocations
    expected_bayt = {
        'العدة': 'بَيْت الْعِدَّةِ',
        'المستأجر': 'بَيْت الْمُسْتَأْجِرِ',
        'المشتري': 'بَيْتِ الْمُشْتَرِي',
        'الرجل': 'بَيْت الرَّجُلِ',
        'البناء': 'بَيْت الْبِنَاءِ',
        'الزوج': 'بَيْت الزَّوْجِ',
        'المال': 'بيت المال',
        'المقدس': 'بَيْت الْمَقْدِسِ',
        'البائع': 'بَيْت الْبَائِعِ',
        'الخلاء': 'بَيْت الْخَلَاءِ',
        'الأب': 'بَيْت الْأَبِ',
        'الله': 'بَيْت اللّهِ',
    }
    self.assertEqual(finder.is_collocated_word(u"بيت"), expected_bayt)
def __init__(self, mycache_path=False):
    """
    Set the default vocalization options and create the analyzers.

    @param mycache_path: value passed as cache_path to the lexical,
        syntactic and semantic analyzers.
    """
    # configure logging
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__name__)

    # maximum number of words to vocalize
    self.limit = 1000

    # ---- options; most of them exist to be switched while debugging ----
    # put the last mark on the vocalized words in the output
    self.enabled_last_mark = True
    # statistical vocalization based on collocations
    self.enabled_stat_tashkeel = True
    # show the collocation marks (off by default)
    self.enabled_show_collocation_mark = False
    # choose the vocalization by score
    self.select_by_score_enabled = False
    # syntactic analysis
    self.enabled_syntaxic_analysis = True
    # adjust the vocalization result for التقاء الساكنين
    self.enabled_ajust_vocalization = True
    # semantic analysis
    self.enabled_semantic_analysis = True
    # allow the last mark (Harakat Al-I3rab)
    self.allow_syntax_last_mark = True

    # ---- analyzers, all given the same cache path ----
    # lexical analyzer, with its cache in use
    lexical = qalsadi.analex.Analex(cache_path=mycache_path)
    self.analyzer = lexical
    lexical.enable_allow_cache_use()
    # syntactic analyzer
    self.anasynt = aranasyn.anasyn.SyntaxAnalyzer(cache_path=mycache_path)
    # no training while doing Tashkeel
    self.syntax_train_enabled = False
    # semantic analyzer
    self.anasem = asmai.anasem.SemanticAnalyzer(cache_path=mycache_path)
    # `debug` is not defined in this method; presumably a module-level
    # flag -- TODO confirm
    lexical.set_debug(debug)
    lexical.set_limit(self.limit)

    # collocations dictionary for statistical tashkeel
    show_marks = self.enabled_show_collocation_mark
    self.collo = coll.CollocationClass(show_marks)
    # vocalizer for the unrecognized words
    self.unknown_vocalizer = unknown_tashkeel.UnknownTashkeel()
def test():
    """
    Run every lookup of the collocation dictionary on sample data and
    print the inputs with the results.
    """

    def show(given, found):
        """Print an input and the result obtained for it."""
        print("inuput:", given)
        print("output:", found)

    finder = msk.CollocationClass()
    first_word = u"كرة"
    second_word = u"القدم"

    # step 1: is a word list a collocation?
    print("step1:test if wordlist is collocation")
    pair = [first_word, second_word]
    found = finder.is_collocated(pair)
    show(pair, found)
    pair = [u"شمس", u"النهار"]
    found = finder.is_collocated(pair)
    show(pair, found)

    # steps 2 and 3: all the collocations of a given word
    print("step2:get all collocations for a specific word")
    found = finder.is_collocated_word(first_word)
    print(first_word, found)
    print("step3:get all collocations for a specific word")
    other_word = u"بيت"
    found = finder.is_collocated_word(other_word)
    show(other_word, found)

    # steps 4 and 4.1: collocations inside a phrase
    phrase = u"لعبنا مباراة كرة القدم في بيت المقدس"
    print("step4: detect collocations in phrase")
    tokens = araby.tokenize(phrase)
    found = finder.ngramfinder(2, tokens)
    show(phrase, found)
    print("step4.1: detect collocations in phrase")
    tokens = araby.tokenize(phrase)
    found = finder.lookup(tokens)
    show(phrase, found)

    # steps 5 and 5-b: long collocations
    phrase = u" قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت"
    print("step5: long collocations")
    found = finder.lookup4long_collocations(phrase)
    show(phrase, found)
    print("step5-b: long collocations")
    found = finder.lookup4long_collocations(phrase)
    show(phrase, found)
    # the same data again, written as test code
    print("inpt = u'%s'"%phrase)
    print("output = u'%s'"%found)

    # step 6: possible collocations, over each pair of consecutive words
    print("step6: detect possible collocations")
    phrase = u"ظهر رئيس الوزراء السيد عبد الملك بن عامر ومعه أمير دولة غرناطة ونهر النيل انطلاق السباق"
    tokens = araby.tokenize(phrase)
    before = "__"
    for current in tokens:
        pair = [before, current]
        found = finder.is_possible_collocation(pair, lenght = 2)
        show(pair, found)
        before = current

    # The same pairs printed as a list literal; `before` is not reset, so
    # the first pair starts with the last word of the phrase.
    print("[\n")
    for current in tokens:
        pair = [before, current]
        found = finder.is_possible_collocation(pair, lenght = 2)
        print("[",pair, ",", found,"],")
        before = current
    print("]")