Example #1
	def __init__(self, debug=False):
		# create a stemmer object for stemming enclitics and proclitics
		self.compStemmer = tashaphyne.stemming.ArabicLightStemmer()
		# configure the stemmer object
		self.compStemmer.set_infix_letters(stem_noun_const.COMP_INFIX_LETTERS)
		self.compStemmer.set_prefix_letters(stem_noun_const.COMP_PREFIX_LETTERS)
		self.compStemmer.set_suffix_letters(stem_noun_const.COMP_SUFFIX_LETTERS)
		self.compStemmer.set_max_prefix_length(stem_noun_const.COMP_MAX_PREFIX)
		self.compStemmer.set_max_suffix_length(stem_noun_const.COMP_MAX_SUFFIX)
		self.compStemmer.set_min_stem_length(stem_noun_const.COMP_MIN_STEM)
		self.compStemmer.set_prefix_list(stem_noun_const.COMP_PREFIX_LIST)
		self.compStemmer.set_suffix_list(stem_noun_const.COMP_SUFFIX_LIST)

		# create a stemmer object for stemming conjugated verbs
		self.conjStemmer = tashaphyne.stemming.ArabicLightStemmer()
		# configure the stemmer object
		self.conjStemmer.set_infix_letters(stem_noun_const.CONJ_INFIX_LETTERS)
		self.conjStemmer.set_prefix_letters(stem_noun_const.CONJ_PREFIX_LETTERS)
		self.conjStemmer.set_suffix_letters(stem_noun_const.CONJ_SUFFIX_LETTERS)
		self.conjStemmer.set_max_prefix_length(stem_noun_const.CONJ_MAX_PREFIX)
		self.conjStemmer.set_max_suffix_length(stem_noun_const.CONJ_MAX_SUFFIX)
		self.conjStemmer.set_min_stem_length(stem_noun_const.CONJ_MIN_STEM)
		self.conjStemmer.set_prefix_list(stem_noun_const.CONJ_PREFIX_LIST)
		self.conjStemmer.set_suffix_list(stem_noun_const.CONJ_SUFFIX_LIST)
		# noun dictionary (disabled: the word frequency dictionary is used instead)
		# self.nounDictionary = arabicdictionary.arabicDictionary("nouns", NOUN_DICTIONARY_INDEX)
		# word frequency dictionary
		self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
			'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)
		# use the word frequency dictionary as a dictionary for unknown words
		self.nounDictionary = self.wordfreq

		self.debug = debug
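The two Tashaphyne stemmers are only configured here, not exercised. A minimal standalone sketch of how an ArabicLightStemmer is typically driven, using Tashaphyne's public API (the sample word is illustrative):

    from tashaphyne.stemming import ArabicLightStemmer

    stemmer = ArabicLightStemmer()
    # segment the word into prefix, stem, and suffix candidates
    stemmer.light_stem(u'أفتضاربانني')
    print(stemmer.get_stem())  # light stem with affixes stripped
    print(stemmer.get_root())  # extracted root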
Example #2
    def __init__(self, allowTagGuessing=True, allowDisambiguation=True):
        """
		Create Analex instance.
		"""

        self.nounstemmer = stem_noun.nounStemmer()  # to stem nouns
        self.verbstemmer = stem_verb.verbStemmer()  # to stem verbs
        self.unknownstemmer = stem_unknown.unknownStemmer()  # to stem unknown words
        self.stopwordsstemmer = stem_stopwords.stopWordStemmer()  # to stem stopwords

        self.allowTagGuessing = allowTagGuessing  # allow guessing tags by naftawayh before analysis
        # if tagging is disabled, disambiguation is also disabled
        self.allowDisambiguation = allowDisambiguation and allowTagGuessing  # allow disambiguation before analysis
        # enable the last mark (Harakat Al-I3rab)
        self.allowSyntaxLastMark = True
        if self.allowTagGuessing:
            self.tagger = naftawayh.wordtag.WordTagger()
        if self.allowDisambiguation:
            self.disambiguator = disambig.disambiguator()
        self.debug = False  # allow printing internal data
        self.limit = 10000  # maximum number of words to process per text
        self.wordcounter = 0
        # Arabic words contain letters and harakat (diacritics);
        # Unicode classifies harakat as marks, not letters,
        # so we add them to the regular expression used to tokenize
        marks = u"".join(
            araby.TASHKEEL
        )  # contains FATHA, DAMMA, KASRA, SUKUN, DAMMATAN, KASRATAN, FATHATAN, SHADDA
        # used to tokenize Arabic text
        self.token_pat = re.compile(u"([\\w%s]+)" % marks, re.UNICODE)
        # used to split text into clauses
        self.Clause_pattern = re.compile(
            u"([\\w%s\\s]+)" % (u"".join(araby.TASHKEEL), ), re.UNICODE)

        # allow partial vocalization support:
        # the text can be analyzed whether partially or fully vocalized
        self.partial_vocalization_support = True

        # word frequency dictionary
        self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
            'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)

        # cache used to avoid duplicated lookups in the word frequency database
        # across multiple calls to analex
        self.allowCacheUse = True
        if self.allowCacheUse:
            self.cache = cache.cache()
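The token pattern built in this constructor can be tested on its own. A short sketch, assuming pyarabic is installed (the sample sentence is illustrative):

    import re
    import pyarabic.araby as araby

    # same construction as in __init__: word characters plus harakat
    marks = u"".join(araby.TASHKEEL)
    token_pat = re.compile(u"([\\w%s]+)" % marks, re.UNICODE)
    text = u"ذَهَبَ الوَلَدُ إِلَى المَدْرَسَةِ"
    print(token_pat.findall(text))  # one token per word, diacritics preserved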
Example #3
    def __init__(self, debug=False):
        # create a stemmer object for stemming enclitics and proclitics
        self.compStemmer = tashaphyne.stemming.ArabicLightStemmer()
        # configure the stemmer object
        self.compStemmer.set_infix_letters(stem_noun_const.COMP_INFIX_LETTERS)
        self.compStemmer.set_prefix_letters(
            stem_noun_const.COMP_PREFIX_LETTERS)
        self.compStemmer.set_suffix_letters(
            stem_noun_const.COMP_SUFFIX_LETTERS)
        self.compStemmer.set_max_prefix_length(stem_noun_const.COMP_MAX_PREFIX)
        self.compStemmer.set_max_suffix_length(stem_noun_const.COMP_MAX_SUFFIX)
        self.compStemmer.set_min_stem_length(stem_noun_const.COMP_MIN_STEM)
        self.compStemmer.set_prefix_list(stem_noun_const.COMP_PREFIX_LIST)
        self.compStemmer.set_suffix_list(stem_noun_const.COMP_SUFFIX_LIST)
        # create a stemmer object for stemming conjugated verbs
        self.conjStemmer = tashaphyne.stemming.ArabicLightStemmer()
        # configure the stemmer object
        self.conjStemmer.set_infix_letters(stem_noun_const.CONJ_INFIX_LETTERS)
        self.conjStemmer.set_prefix_letters(
            stem_noun_const.CONJ_PREFIX_LETTERS)
        self.conjStemmer.set_suffix_letters(
            stem_noun_const.CONJ_SUFFIX_LETTERS)
        self.conjStemmer.set_max_prefix_length(stem_noun_const.CONJ_MAX_PREFIX)
        self.conjStemmer.set_max_suffix_length(stem_noun_const.CONJ_MAX_SUFFIX)
        self.conjStemmer.set_min_stem_length(stem_noun_const.CONJ_MIN_STEM)
        self.conjStemmer.set_prefix_list(stem_noun_const.CONJ_PREFIX_LIST)
        self.conjStemmer.set_suffix_list(stem_noun_const.CONJ_SUFFIX_LIST)
        # noun dictionary
        self.nounDictionary = arabicdictionary.arabicDictionary(
            "nouns", NOUN_DICTIONARY_INDEX)
        # word frequency dictionary
        self.wordfreq = wordfreqdictionaryclass.wordfreqDictionary(
            'wordfreq', wordfreqdictionaryclass.wordfreq_DICTIONARY_INDEX)

        # self.TriVerbTable_INDEX = {}
        self.Table_affix_INDEX = {}
        self.NOUN_DICTIONARY_STAMP = {}
        # allow printing internal results
        self.debug = debug
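A sketch of what one configured ArabicLightStemmer instance exposes after stemming; get_prefix/get_stem/get_suffix are Tashaphyne accessors, Tashaphyne's default affix lists are used here instead of the COMP_*/CONJ_* constants, and the sample word is illustrative:

    from tashaphyne.stemming import ArabicLightStemmer

    comp_stemmer = ArabicLightStemmer()
    # the class above would now call set_prefix_list(), set_suffix_list(), etc.
    comp_stemmer.light_stem(u'والمعلمون')
    print(comp_stemmer.get_prefix())  # detected proclitic/prefix segment
    print(comp_stemmer.get_stem())    # remaining stem
    print(comp_stemmer.get_suffix())  # detected enclitic/suffix segment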