def __init__(self, debug=False):
    # create a stemmer object for stemming enclitics and proclitics
    self.compStemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.compStemmer.set_infix_letters(stem_noun_const.COMP_INFIX_LETTERS)
    self.compStemmer.set_prefix_letters(stem_noun_const.COMP_PREFIX_LETTERS)
    self.compStemmer.set_suffix_letters(stem_noun_const.COMP_SUFFIX_LETTERS)
    self.compStemmer.set_max_prefix_length(stem_noun_const.COMP_MAX_PREFIX)
    self.compStemmer.set_max_suffix_length(stem_noun_const.COMP_MAX_SUFFIX)
    self.compStemmer.set_min_stem_length(stem_noun_const.COMP_MIN_STEM)
    self.compStemmer.set_prefix_list(stem_noun_const.COMP_PREFIX_LIST)
    self.compStemmer.set_suffix_list(stem_noun_const.COMP_SUFFIX_LIST)

    # create a stemmer object for stemming the conjugated (affixed) form
    self.conjStemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.conjStemmer.set_infix_letters(stem_noun_const.CONJ_INFIX_LETTERS)
    self.conjStemmer.set_prefix_letters(stem_noun_const.CONJ_PREFIX_LETTERS)
    self.conjStemmer.set_suffix_letters(stem_noun_const.CONJ_SUFFIX_LETTERS)
    self.conjStemmer.set_max_prefix_length(stem_noun_const.CONJ_MAX_PREFIX)
    self.conjStemmer.set_max_suffix_length(stem_noun_const.CONJ_MAX_SUFFIX)
    self.conjStemmer.set_min_stem_length(stem_noun_const.CONJ_MIN_STEM)
    self.conjStemmer.set_prefix_list(stem_noun_const.CONJ_PREFIX_LIST)
    self.conjStemmer.set_suffix_list(stem_noun_const.CONJ_SUFFIX_LIST)

    # noun dictionary
    #self.nounDictionary = arabicdictionary.arabicDictionary("nouns", NOUN_DICTIONARY_INDEX)

    # word frequency dictionary
    self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
        'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)
    # use the word frequency dictionary as a dictionary for unknown words
    self.nounDictionary = self.wordfreq

    # allow printing internal results
    self.debug = debug
def __init__(self, allowTagGuessing=True, allowDisambiguation=True):
    """ Create an Analex instance. """
    self.nounstemmer = stem_noun.nounStemmer()  # to stem nouns
    self.verbstemmer = stem_verb.verbStemmer()  # to stem verbs
    self.unknownstemmer = stem_unknown.unknownStemmer()  # to stem unknown words
    self.stopwordsstemmer = stem_stopwords.stopWordStemmer()  # to stem stopwords

    # allow guessing tags with naftawayh before analysis
    self.allowTagGuessing = allowTagGuessing
    # if tagging is disabled, disambiguation is also disabled
    # allow disambiguation before analysis
    self.allowDisambiguation = allowDisambiguation and allowTagGuessing
    # enable the last mark (Harakat Al-I3rab)
    self.allowSyntaxLastMark = True
    if self.allowTagGuessing:
        self.tagger = naftawayh.wordtag.WordTagger()
    if self.allowDisambiguation:
        self.disambiguator = disambig.disambiguator()
    self.debug = False  # allow printing internal data
    self.limit = 10000  # limit the number of words in the text
    self.wordcounter = 0

    # Words contain Arabic letters and harakat.
    # Unicode treats Arabic harakat as marks, not letters,
    # so we add the harakat to the regular expression used to tokenize.
    marks = u"".join(araby.TASHKEEL)
    # contains [FATHA, DAMMA, KASRA, SUKUN, DAMMATAN, KASRATAN, FATHATAN, SHADDA]
    # used to tokenize Arabic text
    self.token_pat = re.compile(u"([\w%s]+)" % marks, re.UNICODE)
    # used to split text into clauses
    self.Clause_pattern = re.compile(
        u"([\w%s\s]+)" % (u"".join(araby.TASHKEEL), ), re.UNICODE)

    # allow partial vocalization support:
    # the text is analyzed as partially or fully vocalized.
    self.partial_vocalization_support = True

    # word frequency dictionary
    self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
        'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)

    # cache used to avoid duplicated lookups in the word frequency database
    # across multiple calls of analex
    self.allowCacheUse = True
    if self.allowCacheUse:
        self.cache = cache.cache()
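# The token pattern built above keeps each vocalized word in one piece, because
# plain \w does not cover the harakat characters. A minimal illustrative sketch
# (assumptions: `araby` is pyarabic.araby as imported by this module; the sample
# sentence and the helper name _demo_tokenize are invented for demonstration):

def _demo_tokenize():
    import re
    import pyarabic.araby as araby

    marks = u"".join(araby.TASHKEEL)  # FATHA, DAMMA, KASRA, SHADDA, ...
    token_pat = re.compile(u"([\w%s]+)" % marks, re.UNICODE)
    text = u"ذَهَبَ الوَلَدُ إِلَى المَدْرَسَةِ"
    # each vocalized word is returned as a single token, diacritics preserved
    return token_pat.findall(text)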
def __init__(self, debug=False):
    # create a stemmer object for stemming enclitics and proclitics
    self.compStemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.compStemmer.set_infix_letters(stem_noun_const.COMP_INFIX_LETTERS)
    self.compStemmer.set_prefix_letters(stem_noun_const.COMP_PREFIX_LETTERS)
    self.compStemmer.set_suffix_letters(stem_noun_const.COMP_SUFFIX_LETTERS)
    self.compStemmer.set_max_prefix_length(stem_noun_const.COMP_MAX_PREFIX)
    self.compStemmer.set_max_suffix_length(stem_noun_const.COMP_MAX_SUFFIX)
    self.compStemmer.set_min_stem_length(stem_noun_const.COMP_MIN_STEM)
    self.compStemmer.set_prefix_list(stem_noun_const.COMP_PREFIX_LIST)
    self.compStemmer.set_suffix_list(stem_noun_const.COMP_SUFFIX_LIST)

    # create a stemmer object for stemming the conjugated (affixed) form
    self.conjStemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.conjStemmer.set_infix_letters(stem_noun_const.CONJ_INFIX_LETTERS)
    self.conjStemmer.set_prefix_letters(stem_noun_const.CONJ_PREFIX_LETTERS)
    self.conjStemmer.set_suffix_letters(stem_noun_const.CONJ_SUFFIX_LETTERS)
    self.conjStemmer.set_max_prefix_length(stem_noun_const.CONJ_MAX_PREFIX)
    self.conjStemmer.set_max_suffix_length(stem_noun_const.CONJ_MAX_SUFFIX)
    self.conjStemmer.set_min_stem_length(stem_noun_const.CONJ_MIN_STEM)
    self.conjStemmer.set_prefix_list(stem_noun_const.CONJ_PREFIX_LIST)
    self.conjStemmer.set_suffix_list(stem_noun_const.CONJ_SUFFIX_LIST)

    # noun dictionary
    self.nounDictionary = arabicdictionary.arabicDictionary(
        "nouns", NOUN_DICTIONARY_INDEX)

    # word frequency dictionary
    self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
        'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)

    # self.TriVerbTable_INDEX = {}
    self.Table_affix_INDEX = {}
    self.NOUN_DICTIONARY_STAMP = {}

    # allow printing internal results
    self.debug = debug
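# A minimal sketch of the two-pass segmentation the two stemmers above are set
# up for: the "comp" stemmer peels proclitics/enclitics off the raw word, then
# the "conj" stemmer strips the conjugation affixes from what remains. It is an
# illustration only: it assumes tashaphyne's light_stem()/get_prefix()/get_stem()/
# get_suffix() accessors behave as on the stemmers configured above, elides the
# COMP_*/CONJ_* configuration calls, and invents the helper name and sample word:

def _demo_two_pass_stemming(word=u"والمعلمون"):
    import tashaphyne.stemming

    comp_stemmer = tashaphyne.stemming.ArabicLightStemmer()  # clitics pass
    conj_stemmer = tashaphyne.stemming.ArabicLightStemmer()  # affixes pass
    # ... apply the same COMP_* / CONJ_* settings as in __init__ above ...

    comp_stemmer.light_stem(word)   # first pass: strip proclitics/enclitics
    core = comp_stemmer.get_stem()
    conj_stemmer.light_stem(core)   # second pass: strip conjugation affixes
    return (comp_stemmer.get_prefix(),  # proclitic part
            conj_stemmer.get_prefix(),  # conjugation prefix
            conj_stemmer.get_stem(),    # bare stem
            conj_stemmer.get_suffix(),  # conjugation suffix
            comp_stemmer.get_suffix())  # enclitic part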