Code example #1
 def __init__(self):
     self.word = u""
     self.verbstemmer = tasha.ArabicLightStemmer()
     # prepare the verb stemmer
     #verb_prefix = u"أسفلونيتا"
     #verb_infix = u"اتويدط"
     #verb_suffix = u"امتةكنهوي"
     #verb_max_prefix = 4
     #verb_max_suffix = 6
     #self.verbstemmer.set_max_prefix_length(verb_max_prefix)
     #self.verbstemmer.set_max_suffix_length(verb_max_suffix)
     #self.verbstemmer.set_prefix_letters(verb_prefix)
     #self.verbstemmer.set_suffix_letters(verb_suffix)
     self.verbstemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)
     #self.verbstemmer.infix_letters = verb_infix
     # prepare the noun stemmer
     self.nounstemmer = tasha.ArabicLightStemmer()
     #noun_prefix = u"مأسفلونيتاكب"
     #noun_infix = u"اتويدط"
     #noun_suffix = u"امتةكنهوي"
     #self.nounstemmer.set_prefix_letters(noun_prefix)
     #self.nounstemmer.set_suffix_letters(noun_suffix)
     self.nounstemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)
     #self.nounstemmer.infix_letters = noun_infix
     self.cache = {}  # a cache to speed up the tagging process
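
The constructor above only configures the two stemmers. A minimal usage sketch, assuming `tagger` is an instance of the class this `__init__` belongs to (the class name is not shown in the snippet, and the input word is made up):

# light_stem and get_stem are standard tashaphyne ArabicLightStemmer methods
word = u"سيكتبون"  # hypothetical input word
tagger.verbstemmer.light_stem(word)
print(tagger.verbstemmer.get_stem())   # stem after stripping verbal affixes
tagger.nounstemmer.light_stem(word)
print(tagger.nounstemmer.get_stem())   # stem after stripping nominal affixes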
Code example #2
import sys
from xml.dom import minidom
from tashaphyne import stemming

# display_word_seg, arepr and included are helper functions defined
# elsewhere in the original test script.

def main():
    """ Get Data Set"""
    DATA_FILE = 'samples/NAFIS_gold_standard.xml'
    try:
        # first test: check that the file exists and is readable
        open(DATA_FILE).close()
    except IOError:
        print "Can't Open the file, first test", DATA_FILE
        sys.exit()
    try:
        xmldoc = minidom.parse(DATA_FILE)
    except Exception:
        print "Can't Open the file", DATA_FILE
        sys.exit()
    word_dict = display_word_seg(xmldoc)
    #~ print repr(word_dict).replace('}','}\n').decode('unicode-escape')
    
    
    # test Tashaphyne
    stmer = stemming.ArabicLightStemmer()
    word_dict_tasha = {}
    total_score = 0
    total_seg_tasha = 0
    total_seg_nafis = 0

    scores = {}
    for word in word_dict.keys():
        stmer.segment(word)
        stmer.light_stem(word)
        segmentation = stmer.get_affix_list()
        word_dict_tasha[word] = segmentation
        print "*"*50
        print word.encode('utf8')
        print (arepr(word_dict[word]))
        print "-"*50
        print (arepr(segmentation))
        
        #~ score = compare(word_dict[word], segmentation)
        score = included(word_dict[word], segmentation)
        scores[word] = int(score)
        total_score += score
        total_seg_tasha += len(segmentation)
        total_seg_nafis += len(word_dict[word])
        print word.encode('utf8'), score
    print "-----scores --------"
    for k in scores:
        print k.encode('utf8'), scores[k]
    print "total_score", total_score
    print "total_seg_tasha", total_seg_tasha
    print "total_seg_nafis", total_seg_nafis       
Code example #3
# sconst, vconst, myprint and print_automate are defined elsewhere in the
# original script.

# syntactic
print "StopWord Syntactic Prefixes"
myprint(sconst.COMP_PREFIX_LIST)
print "StopWord Syntactic Suffixes"
myprint(sconst.COMP_SUFFIX_LIST)

# morphologic
print "StopWord Morphologic Prefixes"
myprint(sconst.CONJ_PREFIX_LIST)
print "StopWord Morphologic Suffixes"
myprint(sconst.CONJ_SUFFIX_LIST)

# print a customized automaton

import tashaphyne.stemming as tast
stemmer = tast.ArabicLightStemmer()

print "build tashaphyne automaton"
print "Tashaphyne prefixe automaton"
print_automate(stemmer.prefixes_tree)
print "Tashaphyne suffixe automaton"
print_automate(stemmer.suffixes_tree)

print "build verb automaton"
stemmer.set_prefix_list(vconst.COMP_PREFIX_LIST)
stemmer.set_suffix_list(vconst.COMP_SUFFIX_LIST)

print "prefixe automaton"
print_automate(stemmer.prefixes_tree)
print "suffixe automaton"
print_automate(stemmer.suffixes_tree)
Code example #4
 def __init__(self):
     self.stem = stemming.ArabicLightStemmer()
     self.lemmer = qalsadi.lemmatizer.Lemmatizer()
Code example #5
from pyarabic.araby import tokenize, strip_tashkeel
from arabicstopwords.arabicstopwords import is_stop
import qalsadi.lemmatizer

from tashaphyne import stemming

stem = stemming.ArabicLightStemmer()
lemmer = qalsadi.lemmatizer.Lemmatizer()


def remove_tashkeel(text):
    return strip_tashkeel(text)


def tokenize_text(text):
    return tokenize(text)


def lemmatize_text(text):
    return lemmer.lemmatize_text(text)


def remove_stop_word(text: list):
    non_stop_words = []
    for word in text:
        if not is_stop(word):
            non_stop_words.append(word)
    return non_stop_words


def process_text_lemm(text):
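    # The example is cut off here in the source; a plausible body, chaining the
    # helpers defined above (an assumption, not the original code), would be:
    tokens = tokenize_text(remove_tashkeel(text))
    tokens = remove_stop_word(tokens)
    return lemmatize_text(u" ".join(tokens))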
Code example #6
import os
import re
import sys

sys.path.append('/opt/mishkal/lib')
sys.path.append('../lib')
# add the script's own directory to the lib path
# print os.path.join(os.path.dirname(sys.argv[0]), 'lib')
sys.path.append(os.path.join(os.path.dirname(sys.argv[0]), 'lib'))
# sys.exit();
import pyarabic.araby as araby
import tashaphyne.stemming as stemmer
# symbols: all tashkeel marks plus some special letters (weak letters,
# teh marbuta, alef with hamza below)
symbols = araby.TASHKEEL + araby.WEAK + (araby.TEH_MARBUTA, araby.ALEF_HAMZA_BELOW)

NON_TASHKEEL_pattern = re.compile(ur"[^" + u''.join(symbols) + u"]", re.UNICODE)
NON_VOCALIZATION_pattern = re.compile(ur"[^" + u''.join(araby.TASHKEEL) + u"]", re.UNICODE)

analyzer = stemmer.ArabicLightStemmer()

def extractHarakat(word, joker=araby.TATWEEL):
    """
    Extract all harakat from the word; every other letter is replaced with the joker.
    """
    harakatPattern = re.sub(NON_TASHKEEL_pattern, joker, word)
    return harakatPattern


def extractPattern(word, joker=araby.TATWEEL):
    """
    Extract the word pattern: the word is first turned into its star form by the
    stemmer, then every letter except the harakat is replaced with the joker.
    """
    starword, left, right = analyzer.transformToStars(word)
    # harakatPattern = re.sub(NON_TASHKEEL_pattern, joker, word)
    starword = re.sub(NON_TASHKEEL_pattern, joker, starword)
    # print newword.encode('utf8')
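    # The example is cut off here in the source; the function most likely ends by
    # returning the star/harakat pattern (an assumption, not the original code):
    return starword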