def __init__(self):
    # assumes the enclosing module imports tashaphyne.stemming as tasha
    # and an affix_const module that provides the affix list constants
    self.word = u""
    # prepare the verb stemmer
    self.verbstemmer = tasha.ArabicLightStemmer()
    # legacy letter-based configuration, kept commented for reference:
    #verb_prefix = u"أسفلونيتا"
    #verb_infix = u"اتويدط"
    #verb_suffix = u"امتةكنهوي"
    #verb_max_prefix = 4
    #verb_max_suffix = 6
    #self.verbstemmer.set_max_prefix_length(verb_max_prefix)
    #self.verbstemmer.set_max_suffix_length(verb_max_suffix)
    #self.verbstemmer.set_prefix_letters(verb_prefix)
    #self.verbstemmer.set_suffix_letters(verb_suffix)
    self.verbstemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)
    #self.verbstemmer.infix_letters = verb_infix
    # prepare the noun stemmer
    self.nounstemmer = tasha.ArabicLightStemmer()
    #noun_prefix = u"مأسفلونيتاكب"
    #noun_infix = u"اتويدط"
    #noun_suffix = u"امتةكنهوي"
    #self.nounstemmer.set_prefix_letters(noun_prefix)
    #self.nounstemmer.set_suffix_letters(noun_suffix)
    self.nounstemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)
    #self.nounstemmer.infix_letters = noun_infix
    self.cache = {}  # a cache to speed up the tagging process
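# usage sketch: driving an ArabicLightStemmer the way the verb stemmer
# above is used (assumption: default affix lists rather than the
# VERBAL_PREFIX_LIST configuration; the sample word is illustrative)
import tashaphyne.stemming as tasha

sketch_stemmer = tasha.ArabicLightStemmer()
sketch_word = u"فسيكتبون"
sketch_stemmer.light_stem(sketch_word)
print sketch_stemmer.get_prefix().encode('utf8'), \
      sketch_stemmer.get_stem().encode('utf8'), \
      sketch_stemmer.get_suffix().encode('utf8')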
def main():
    """ Get the data set and score Tashaphyne against the NAFIS gold standard.

    Assumes: import sys; from xml.dom import minidom;
    from tashaphyne import stemming; plus display_word_seg(), arepr()
    and included() defined elsewhere in this script.
    """
    DATA_FILE = 'samples/NAFIS_gold_standard.xml'
    # first test: can the file be opened at all?
    try:
        xmldoc = open(DATA_FILE)
        xmldoc.close()
    except IOError:
        print "Can't open the file, first test", DATA_FILE
        sys.exit()
    try:
        xmldoc = minidom.parse(DATA_FILE)
    except:
        print "Can't parse the file", DATA_FILE
        sys.exit()
    word_dict = display_word_seg(xmldoc)
    #~ print repr(word_dict).replace('}', '}\n').decode('unicode-escape')
    # test Tashaphyne
    stmer = stemming.ArabicLightStemmer()
    word_dict_tasha = {}
    total_score = 0
    total_seg_tasha = 0
    total_seg_nafis = 0
    scores = {}
    for word in word_dict.keys():
        stmer.segment(word)
        stmer.light_stem(word)
        segmentation = stmer.get_affix_list()
        word_dict_tasha[word] = segmentation
        print "*" * 50
        print word.encode('utf8')
        print arepr(word_dict[word])
        print "-" * 50
        print arepr(segmentation)
        #~ score = compare(word_dict[word], segmentation)
        score = included(word_dict[word], segmentation)
        scores[word] = int(score)
        total_score += score
        total_seg_tasha += len(segmentation)
        total_seg_nafis += len(word_dict[word])
        print word.encode('utf8'), score
    print "----- scores --------"
    for k in scores:
        print k.encode('utf8'), scores[k]
    print "total_score", total_score
    print "total_seg_tasha", total_seg_tasha
    print "total_seg_nafis", total_seg_nafis
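# a minimal sketch of an inclusion score like included() above (assumption:
# it counts how many gold-standard segmentations also occur in the
# Tashaphyne affix list; the real NAFIS scoring may differ)
def included_sketch(gold_segs, tasha_segs):
    return sum(1 for seg in gold_segs if seg in tasha_segs)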
# sconst/vconst are affix constant modules and myprint()/print_automate()
# are helpers, all imported or defined elsewhere in this script
# syntaxic
print "StopWord Syntaxic Prefixes"
myprint(sconst.COMP_PREFIX_LIST)
print "StopWord Syntaxic Suffixes"
myprint(sconst.COMP_SUFFIX_LIST)
# morphologic
print "StopWord Morphologic Prefixes"
myprint(sconst.CONJ_PREFIX_LIST)
print "StopWord Morphologic Suffixes"
myprint(sconst.CONJ_SUFFIX_LIST)
# print a customized automaton
import tashaphyne.stemming as tast
stemmer = tast.ArabicLightStemmer()
print "build tashaphyne automaton"
print "Tashaphyne prefix automaton"
print_automate(stemmer.prefixes_tree)
print "Tashaphyne suffix automaton"
print_automate(stemmer.suffixes_tree)
print "build verb automaton"
stemmer.set_prefix_list(vconst.COMP_PREFIX_LIST)
stemmer.set_suffix_list(vconst.COMP_SUFFIX_LIST)
print "prefix automaton"
print_automate(stemmer.prefixes_tree)
print "suffix automaton"
print_automate(stemmer.suffixes_tree)
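# a minimal sketch of what a printer like print_automate() above might do
# (assumption: prefixes_tree/suffixes_tree are nested dicts keyed by letter,
# with a '#' entry marking where a complete affix ends)
def print_automate_sketch(tree, path=u""):
    for key, subtree in tree.items():
        if key == '#':
            print path.encode('utf8')  # a complete affix ends here
        else:
            print_automate_sketch(subtree, path + key)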
def __init__(self):
    self.stem = stemming.ArabicLightStemmer()
    self.lemmer = qalsadi.lemmatizer.Lemmatizer()
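# usage sketch (assumption: `Preprocessor` is a hypothetical name for the
# enclosing class; the sample word is illustrative)
proc = Preprocessor()
print(proc.stem.light_stem("الكتاب"))   # Tashaphyne light stem
print(proc.lemmer.lemmatize("الكتاب"))  # Qalsadi lemma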
from pyarabic.araby import tokenize, strip_tashkeel
from arabicstopwords.arabicstopwords import is_stop
import qalsadi.lemmatizer
from tashaphyne import stemming

stem = stemming.ArabicLightStemmer()
lemmer = qalsadi.lemmatizer.Lemmatizer()


def remove_tashkeel(text):
    return strip_tashkeel(text)


def tokenize_text(text):
    return tokenize(text)


def lemmatize_text(text):
    return lemmer.lemmatize_text(text)


def remove_stop_word(text: list):
    non_stop_words = []
    for word in text:
        if not is_stop(word):
            non_stop_words.append(word)
    return non_stop_words


def process_text_lemm(text):
    # the body is truncated in the source; a likely pipeline, sketched from
    # the helpers above:
    text = remove_tashkeel(text)
    words = remove_stop_word(tokenize_text(text))
    return lemmatize_text(" ".join(words))
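# usage sketch for the helpers above (the sample sentence is illustrative)
if __name__ == "__main__":
    sample = "ذَهَبَ الوَلَدُ إِلَى المَدْرَسَةِ"
    plain = remove_tashkeel(sample)
    print(remove_stop_word(tokenize_text(plain)))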
sys.path.append('/opt/mishkal/lib')
sys.path.append('../lib')
# join the actual directory to the lib path
# print os.path.join(os.path.dirname(sys.argv[0]), 'lib')
sys.path.append(os.path.join(os.path.dirname(sys.argv[0]), 'lib'))
# sys.exit()
import pyarabic.araby as araby
import tashaphyne.stemming as stemmer

# all tashkeel and special chars; symbols is a sequence of characters
symbols = araby.TASHKEEL + araby.WEAK + (araby.TEH_MARBUTA, araby.ALEF_HAMZA_BELOW)
NON_TASHKEEL_pattern = re.compile(ur"[^" + u''.join(symbols) + u"]", re.UNICODE)
NON_VOCALIZATION_pattern = re.compile(ur"[^" + u''.join(araby.TASHKEEL) + u"]", re.UNICODE)
analyzer = stemmer.ArabicLightStemmer()


def extractHarakat(word, joker=araby.TATWEEL):
    """ Extract all harakats from the word;
    every other character is replaced by the joker """
    harakatPattern = re.sub(NON_TASHKEEL_pattern, joker, word)
    return harakatPattern


def extractPattern(word, joker=araby.TATWEEL):
    """ Extract the star pattern of the word:
    the stemmer stars the stem letters, then every
    non-tashkeel character is replaced by the joker """
    starword, left, right = analyzer.transformToStars(word)
    # harakatPattern = re.sub(NON_TASHKEEL_pattern, joker, word)
    starword = re.sub(NON_TASHKEEL_pattern, joker, starword)
    # print starword.encode('utf8')
    return starword  # assumed return value; the original snippet is truncated here
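# usage sketch (Python 2, matching the snippet above; assumes the
# `return starword` added at the end of extractPattern)
test_word = u"كَتَبَ"
print extractHarakat(test_word).encode('utf8')  # letters replaced by tatweel
print extractPattern(test_word).encode('utf8')  # star pattern of the stem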