Ejemplo n.º 1
0
    def check_word_as_pounct(self, word):
        """
        Check if the word is punctuation or a number and return the
        corresponding analyzed word cases.

        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # guard clause: empty input yields no analyses
        if not word:
            return detailed_result
        # ToDo : fix it to isdigit, by moatz saad
        if word.isnumeric():
            detailed_result.append(
                wordcase.WordCase({
                    'word': word,
                    'affix': ('', '', '', ''),
                    'stem': '',
                    'original': word,
                    'vocalized': word,
                    'tags': u"عدد",
                    'type': 'NUMBER',
                    'freq': 0,
                    'syntax': '',
                }))
        # if every char is punctuation, the word takes the tags
        # of its first char
        if all(char in stem_pounct_const.POUNCTUATION for char in word):
            detailed_result.append(
                wordcase.WordCase({
                    'word': word,
                    'affix': ('', '', '', ''),
                    'stem': '',
                    'original': word,
                    'vocalized': word,
                    'tags': stem_pounct_const.POUNCTUATION[word[0]]['tags'],
                    'type': 'POUNCT',
                    'freq': 0,
                    'syntax': '',
                }))

        return detailed_result
Ejemplo n.º 2
0
    def steming_second_level(self, stop, stop2, procletic, encletic_nm):
        """
        Analyze word morphologically by stemming the conjugation affixes.

        @param stop: the input stop word.
        @type stop: unicode.
        @param stop2: the stop word stemmed from syntactic affixes.
        @type stop2: unicode.
        @param procletic: the syntactic prefix extracted in the first stage.
        @type procletic: unicode.
        @param encletic_nm: the syntactic suffix extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated stop word
        list_seg_conj = self.conj_stemmer.segment(stop2)
        # keep only segmentations with compatible affixes
        list_seg_conj = verify_affix(stop2, list_seg_conj,
                                     ssconst.STOPWORDS_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        for seg_conj in list_seg_conj:
            stem_conj = stop2[seg_conj[0]:seg_conj[1]]
            suffix_conj_nm = stop2[seg_conj[1]:]

            # generate possible stems:
            # add stripped letters to the stem to constitute
            # the list of possible stop words
            possible_stop_list = get_stem_variants(stem_conj, suffix_conj_nm)

            # search the stop word in the dictionary
            # (the lookup can also return the tashkeel)
            infstop_form_list = []
            for infstop in set(possible_stop_list):
                # look up the stop word and get all its forms from the dict,
                # caching lookups to avoid repeated dictionary hits
                # (BUGFIX: dict.has_key() was removed in Python 3; use `in`,
                # consistent with the noun stemmer's cache test)
                if infstop not in self.cache_dict_search:
                    infstop_foundlist = self.stop_dictionary.lookup(infstop)
                    # NOTE(review): the cache stores create_dict_word(...)
                    # while this first lookup extends with the raw list --
                    # confirm this asymmetry is intended
                    self.cache_dict_search[infstop] = create_dict_word(
                        infstop_foundlist)
                else:
                    infstop_foundlist = self.cache_dict_search[infstop]
                infstop_form_list.extend(infstop_foundlist)
            for stop_tuple in infstop_form_list:
                original = stop_tuple['vocalized']

                # test if the given dictionary word accepts the tags
                # given by the affixes (e.g. does the word accept the
                # feminine marker), for every vocalized form of the
                # enclitic and of the conjugation suffix
                for vocalized_encletic in ssconst.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in ssconst.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # affix tags contain both prefix and suffix tags
                        affix_tags = ssconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                                  +ssconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                                  +ssconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                        # verify compatibility between proclitics and affixes
                        if validate_tags(stop_tuple, affix_tags, procletic,
                                         vocalized_encletic,
                                         vocalized_suffix) and \
                           self.is_compatible_proaffix_affix(
                                stop_tuple, procletic, vocalized_encletic,
                                vocalized_suffix):

                            vocalized, semi_vocalized = vocalize(
                                original, procletic, vocalized_suffix,
                                vocalized_encletic)
                            vocalized = ajust_vocalization(vocalized)
                            # whether the stop word is inflected
                            # (معرب) or not (مبني)
                            is_inflected = u"مبني" if stop_tuple[
                                'is_inflected'] == 0 else u"معرب"
                            # add some tags from the dictionary entry,
                            # such as action and object_type
                            original_tags = u":".join([
                                stop_tuple['word_type'],
                                stop_tuple['word_class'],
                                is_inflected,
                                stop_tuple['action'],
                            ])
                            # generate the word case
                            detailed_result.append(
                                wordcase.WordCase({
                                    'word': stop,
                                    'affix': (procletic, '', vocalized_suffix,
                                              vocalized_encletic),
                                    'stem': stem_conj,
                                    'original': original,
                                    'vocalized': vocalized,
                                    'semivocalized': semi_vocalized,
                                    'tags': u':'.join(affix_tags),
                                    'type': u':'.join(
                                        ['STOPWORD', stop_tuple['word_type']]),
                                    # to note the frequency type
                                    'freq': 'freqstopword',
                                    'originaltags': original_tags,
                                    "action": stop_tuple['action'],
                                    "object_type": stop_tuple['object_type'],
                                    "need": stop_tuple['need'],
                                    'syntax': '',
                                }))
        return detailed_result
Ejemplo n.º 3
0
    def stemming_verb(self, verb_in):
        """
        Analyze a word morphologically as a verb by stemming it.

        @param verb_in: given verb
        @type verb_in: unicode
        @return: list of dictionaries of stemmed word cases with tags
        @rtype: list
        """
        detailed_result = []
        verb_list = [
            verb_in,
        ] + get_verb_variants(verb_in)

        # first level: segment proclitic / stem / enclitic
        word_segmented_list = []
        for verb in verb_list:

            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                proclitic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                enclitic = verb[seg[1]:]
                # case of a double-transitive verb:
                # keep only the first object suffix
                if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                    firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                    enclitic = firstsuffix

                list_stem = [stem] + get_in_stem_variants(stem, enclitic)
                # if there is an enclitic, the verb must be transitive
                transitive_comp = bool(enclitic)
                for stm in list_stem:
                    word_seg = {
                        "verb": verb,
                        "pro": proclitic,
                        "enc": enclitic,
                        'stem_comp': stm,
                        'trans_comp': transitive_comp,
                    }
                    word_segmented_list.append(word_seg)

        # second level: segment the conjugated stem
        tmp_list = []
        for word_seg in word_segmented_list:
            verb2 = word_seg['stem_comp']
            # segment the conjugated verb
            list_seg_conj = self.conj_stemmer.segment(verb2)

            # verify affix compatibility
            list_seg_conj = verify_affix(verb2, list_seg_conj,
                                         SVC.VERBAL_CONJUGATION_AFFIX)
            # verify the length of the stem
            for seg_conj in list_seg_conj:
                if (seg_conj[1] - seg_conj[0]) <= 6:
                    # word segmentation at level 2
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                    word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                    word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                    tmp_list.append(word_seg_l2)

        # verify compatibility between proclitic/enclitic and affixes
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            proclitic = word_seg['pro']
            enclitic = word_seg['enc']
            affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
            if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
                tmp_list.append(word_seg.copy())

        # verify existence of candidate verb by stamp
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            if self.verb_dictionary.exists_as_stamp(word_seg['stem_conj']):
                tmp_list.append(word_seg.copy())

        # get the infinitive of candidate verbs
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # search the verb in the dictionary by stamp;
            # if the verb exists, transitivity is considered:
            # trilateral verbs return their forms with tashkeel,
            # others without, because the conjugator can vocalize them
            # (we can return the tashkeel if we skip the conjugation step)
            infverb_dict = self.__get_infinitive_verb_by_stem(
                word_seg['stem_conj'], word_seg['trans_comp'])
            # filter verbs
            infverb_dict = self.__verify_infinitive_verbs(
                word_seg['stem_conj'], infverb_dict)

            for item in infverb_dict:
                # the haraka comes from the dictionary entry
                word_seg_l3 = word_seg.copy()
                word_seg_l3['inf'] = item['verb']
                word_seg_l3['haraka'] = item['haraka']
                word_seg_l3['transitive'] = bool(item['transitive'] in ('y',
                                                                        1))
                tmp_list.append(word_seg_l3)

        # conjugation step: conjugate each infinitive verb with the
        # affixes; if a conjugation matches the produced word, keep it
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            one_correct_conj = self.__generate_possible_conjug(
                word_seg['inf'], word_seg['stem_comp'],
                word_seg['prefix'] + '-' + word_seg['suffix'],
                word_seg['haraka'], word_seg['pro'], word_seg['enc'],
                word_seg['transitive'])

            for conj in one_correct_conj:
                word_seg_l4 = word_seg.copy()
                word_seg_l4['conj'] = conj.copy()
                tmp_list.append(word_seg_l4)

        # generate all resulting word cases
        word_segmented_list = tmp_list

        for word_seg in word_segmented_list:
            conj = word_seg['conj']
            vocalized, semivocalized = vocalize(
                conj['vocalized'], word_seg['pro'], word_seg['enc'])
            tag_type = 'Verb'
            original_tags = "y" if conj['transitive'] else "n"

            # BUGFIX: use the clitics of the current segmentation
            # (word_seg['pro'] / word_seg['enc']) instead of the stale
            # `proclitic`/`enclitic` loop variables left over from the
            # compatibility-filter loop above
            detailed_result.append(wordcase.WordCase({
                'word': word_seg['verb'],
                'affix': (word_seg['pro'], word_seg['prefix'],
                          word_seg['suffix'], word_seg['enc']),
                'stem': word_seg['stem_conj'],
                'original': conj['verb'],
                'vocalized': vocalized,
                'semivocalized': semivocalized,
                'tags': u':'.join(
                    (conj['tense'], conj['pronoun'])
                    + SVC.COMP_PREFIX_LIST_TAGS[word_seg['pro']]['tags']
                    + SVC.COMP_SUFFIX_LIST_TAGS[word_seg['enc']]['tags']),
                'type': tag_type,
                'number': conj['pronoun_tags'].get('number', ''),
                'gender': conj['pronoun_tags'].get('gender', ''),
                'person': conj['pronoun_tags'].get('person', ''),
                'tense2': conj['tense_tags'].get('tense', ''),
                'voice': conj['tense_tags'].get('voice', ''),
                'mood': conj['tense_tags'].get('mood', ''),
                'confirmed': conj['tense_tags'].get('confirmed', ''),
                'transitive': conj['transitive'],
                'tense': conj['tense'],
                'pronoun': conj['pronoun'],
                'freq': 'freqverb',
                'originaltags': original_tags,
                'syntax': '',
            }))

        return detailed_result
Ejemplo n.º 4
0
    def steming_second_level(self, noun, stem_comp, procletic_nm, encletic_nm):
        """
        Analyze word morphologically by stemming the conjugation affixes.

        @param noun: the input noun.
        @type noun: unicode.
        @param stem_comp: the noun stemmed from syntactic affixes.
        @type stem_comp: unicode.
        @param procletic_nm: the syntactic prefix extracted in the
        first stage (not vocalized).
        @type procletic_nm: unicode.
        @param encletic_nm: the syntactic suffix extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(stem_comp)
        # keep only segmentations with compatible affixes
        list_seg_conj = verify_affix(stem_comp, list_seg_conj,
                                     snconst.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        for seg_conj in list_seg_conj:
            stem_conj = stem_comp[seg_conj[0]:seg_conj[1]]
            suffix_conj_nm = stem_comp[seg_conj[1]:]

            # normalize hamza before guessing different origins
            stem_conj = ar.normalize_hamza(stem_conj)

            # generate possible stems:
            # add stripped letters to the stem to constitute
            # the list of possible nouns
            possible_noun_list = get_stem_variants(stem_conj, suffix_conj_nm)

            # search the noun in the dictionary
            # (the lookup can also return the tashkeel)
            infnoun_form_list = []
            for infnoun in set(possible_noun_list):
                # look up the noun and get all its forms from the dict,
                # caching lookups to avoid repeated dictionary hits
                # (if the noun has a plural suffix, don't look up in
                # the broken plural dictionary)
                if infnoun not in self.cache_dict_search:
                    infnoun_foundlist = self.noun_dictionary.lookup(infnoun)
                    self.cache_dict_search[infnoun] = infnoun_foundlist
                else:
                    infnoun_foundlist = self.cache_dict_search[infnoun]
                infnoun_form_list.extend(infnoun_foundlist)

            for noun_tuple in infnoun_form_list:
                infnoun = noun_tuple['vocalized']
                # affix tags contain both prefix and suffix tags
                affix_tags = list(
                    set(snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags'] +
                        snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] +
                        snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']))
                # test if the given dictionary word accepts the tags
                # given by the affixes, e.g. does the noun accept the
                # feminine marker
                if validate_tags(noun_tuple, affix_tags, procletic_nm,
                                 encletic_nm, suffix_conj_nm):
                    # iterate over all vocalized forms of the suffixes
                    for vocalized_encletic in snconst.COMP_SUFFIX_LIST_TAGS[
                            encletic_nm]['vocalized']:
                        for vocalized_suffix in snconst.CONJ_SUFFIX_LIST_TAGS[
                                suffix_conj_nm]['vocalized']:

                            # verify compatibility between proclitics and affix
                            if self.is_compatible_proaffix_affix(
                                    noun_tuple, procletic_nm,
                                    vocalized_encletic, vocalized_suffix):
                                vocalized, semi_vocalized, _ = vocalize(
                                    infnoun, procletic_nm, vocalized_suffix,
                                    vocalized_encletic)

                                # add some tags from the dictionary entry,
                                # such as mamnou3 min sarf and broken plural
                                original_tags = []
                                if noun_tuple['mankous'] == u"Tk":
                                    original_tags.append(u"منقوص")
                                # get the vocalized affix tags
                                vocalized_affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags']\
                                  +snconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags']\
                                  +snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                                # some suffixes carry several grammatical
                                # cases, e.g. feminine plural can be both
                                # mansoub and majrour
                                if 'cases' in snconst.CONJ_SUFFIX_LIST_TAGS[
                                        vocalized_suffix]:
                                    list_cases = snconst.CONJ_SUFFIX_LIST_TAGS[
                                        vocalized_suffix]['cases']
                                else:
                                    list_cases = ('', )
                                for case in list_cases:
                                    voc_affix_case = vocalized_affix_tags + (
                                        case, )
                                    detailed_result.append(
                                        wordcase.WordCase({
                                            'word':
                                            noun,
                                            'affix': (procletic_nm, '',
                                                      vocalized_suffix,
                                                      vocalized_encletic),
                                            'stem':
                                            stem_conj,
                                            'original':
                                            infnoun,  #original,
                                            'vocalized':
                                            vocalized,
                                            'semivocalized':
                                            semi_vocalized,
                                            'tags':
                                            u':'.join(voc_affix_case),
                                            'type':
                                            u':'.join([
                                                'Noun', noun_tuple['wordtype']
                                            ]),
                                            'number':
                                            noun_tuple['number'],
                                            'gender':
                                            noun_tuple['gender'],
                                            'freq':
                                            'freqnoun',  # to note the frequency type
                                            'originaltags':
                                            u':'.join(original_tags),
                                            'syntax':
                                            '',
                                        }))
        return detailed_result
Ejemplo n.º 5
0
    def stemming_noun(self, noun_in):
        """
        Analyze a word morphologically as a noun.

        @param noun_in: the input noun.
        @type noun_in: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        debug = False
        detailed_result = []
        noun_list = [
            noun_in,
        ] + get_noun_variants(noun_in)
        # first level: segment proclitic / stem / enclitic
        word_segmented_list = []
        for noun in noun_list:
            list_seg_comp = self.comp_stemmer.segment(noun)
            # keep only segmentations with valid affixes
            list_seg_comp = verify_affix(noun, list_seg_comp,
                                         SNC.COMP_NOUN_AFFIXES)
            # treat multi vocalization enclitic
            for seg in list_seg_comp:
                proclitic_nm = noun[:seg[0]]
                stem = noun[seg[0]:seg[1]]
                enclitic_nm = noun[seg[1]:]
                # adjusting noun variants
                list_stem = [
                    stem,
                ] + get_in_stem_variants(stem, enclitic_nm)

                # stem reduced noun : level two
                for stem in list_stem:
                    word_seg = {
                        'noun': noun,
                        'stem_comp': stem,
                        'pro': proclitic_nm,
                        'enc': enclitic_nm,
                    }
                    word_segmented_list.append(word_seg)
        # level two: segment the conjugated stem
        tmp_list = []
        if debug: print("after first level")
        if debug:
            # BUGFIX: repr() returns str in Python 3, which has no
            # .decode(); print the repr directly
            print(repr(word_segmented_list).replace('},', '},\n'))

        for word_seg in word_segmented_list:
            # segment the conjugated noun
            list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
            # keep only segmentations with compatible affixes
            list_seg_conj = verify_affix(word_seg['stem_comp'], list_seg_conj,
                                         SNC.NOMINAL_CONJUGATION_AFFIX)
            # add vocalized forms of suffixes
            # and create the real affixes from the word
            for seg_conj in list_seg_conj:
                stem_conj = word_seg['stem_comp'][:seg_conj[1]]
                suffix = word_seg['stem_comp'][seg_conj[1]:]
                # normalize hamza before guessing different origins
                stem_conj = ar.normalize_hamza(stem_conj)
                stem_conj_list = get_stem_variants(stem_conj, suffix)

                # generate possible stems:
                # add stripped letters to the stem to constitute
                # the list of possible nouns
                for stem in stem_conj_list:
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2['stem_conj'] = stem
                    word_seg_l2['suffix'] = suffix
                    # affix tags contain both prefix and suffix tags
                    word_seg_l2['affix_tags'] = list(
                        set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]
                            ['tags'] + SNC.COMP_SUFFIX_LIST_TAGS[
                                word_seg_l2['enc']]['tags'] +
                            SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg_l2['suffix']]['tags']))
                    tmp_list.append(word_seg_l2)

        if debug: print("after second level")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # lookup in dictionary
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # search the noun in the dictionary
            # (the lookup can also return the tashkeel)
            inf_noun = word_seg['stem_conj']
            # get the noun and all its forms from the dict, caching
            # lookups to avoid repeated dictionary hits
            # (if the noun has a plural suffix, don't look up in
            # the broken plural dictionary)
            if inf_noun in self.cache_dict_search:
                infnoun_foundlist = self.cache_dict_search[inf_noun]
            else:
                infnoun_foundlist = self.noun_dictionary.lookup(inf_noun)
                self.cache_dict_search[inf_noun] = infnoun_foundlist

            for noun_tuple in infnoun_foundlist:
                word_seg_l3 = word_seg.copy()
                word_seg_l3["original"] = noun_tuple['vocalized']
                word_seg_l3["noun_tuple"] = dict(noun_tuple)
                tmp_list.append(word_seg_l3)

        if debug: print("after lookup dict")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # test compatibility of noun_tuple with affixes and proaffixes
        # and generate vocalized affixes and suffixes
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # test if the given dictionary word accepts the tags
            # given by the affixes, e.g. does the noun accept the
            # feminine marker
            if validate_tags(word_seg['noun_tuple'], word_seg['affix_tags'],
                             word_seg['pro'], word_seg['enc'],
                             word_seg['suffix']):
                # iterate over all vocalized forms of the suffixes
                for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                        word_seg['enc']]['vocalized']:
                    for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                            word_seg['suffix']]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if self.__check_clitic_affix(word_seg['noun_tuple'],
                                                     word_seg['pro'], enc_voc,
                                                     suf_voc):
                            # get the vocalized affix tags
                            affix_tags_voc = SNC.COMP_PREFIX_LIST_TAGS[word_seg['pro']]['tags']\
                              +SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags']\
                              +SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                            word_seg_l4 = word_seg.copy()
                            word_seg_l4['suf_voc'] = suf_voc
                            word_seg_l4['enc_voc'] = enc_voc
                            word_seg_l4['affix_tags'] = affix_tags_voc
                            tmp_list.append(word_seg_l4)

        if debug: print("after check compatibility")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # Generate results
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get vocalized form and vocalized form without inflection
            vocalized, semi_vocalized, _ = vocalize(
                word_seg['noun_tuple']['vocalized'], word_seg['pro'],
                word_seg['suf_voc'], word_seg['enc_voc'])

            # add some tags from the dictionary entry,
            # such as mamnou3 min sarf and broken plural
            original_tags = []
            if word_seg['noun_tuple']['mankous'] == u"Tk":
                original_tags.append(u"منقوص")
            # some suffixes carry several grammatical cases, e.g.
            # feminine plural can be both mansoub and majrour
            if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                    word_seg['suf_voc']]['cases']
            else:
                list_cases = ('', )
            for case in list_cases:
                voc_affix_case = word_seg['affix_tags'] + (case, )
                detailed_result.append(
                    wordcase.WordCase({
                        'word': noun_in,
                        'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                                  word_seg['enc_voc']),
                        'stem': word_seg['stem_conj'],
                        'original': word_seg['noun_tuple']['vocalized'],
                        'vocalized': vocalized,
                        'semivocalized': semi_vocalized,
                        'tags': u':'.join(voc_affix_case),
                        'type': u':'.join(['Noun',
                                           word_seg['noun_tuple']['wordtype']]),
                        'number': word_seg['noun_tuple']['number'],
                        'gender': word_seg['noun_tuple']['gender'],
                        # to note the frequency type
                        'freq': 'freqnoun',
                        'originaltags': u':'.join(original_tags),
                        'syntax': '',
                    }))
        if debug: print("after generate result")
        if debug: print(len(detailed_result))
        return detailed_result
Ejemplo n.º 6
0
    def check_word(self, word, guessedtag = ""):
        """
        Analyze one word morphologically and return all possible analyses.

        The word is checked, in order, as punctuation, stopword, verb and
        noun; if no analysis is found it is analyzed as an unknown word.
        Results are served from / stored into the cache when enabled.
        @param word: the input word.
        @type word: unicode.
        @param guessedtag: optional part-of-speech tag guessed by an external
            tagger; currently unused, kept for API compatibility.
        @type guessedtag: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        # tatweel (kashida) is purely typographic; drop it before analysis
        word = araby.strip_tatweel(word)
        word_vocalised = word
        # the unvocalized form is the lookup and cache key
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.isAlreadyChecked(word_nm):
            resulted_data = self.cache.getChecked(word_nm)
        else:
            resulted_data = []
            # if word is a pounctuation
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have some problems,
            # the stop word can also be another normal word (verb or noun),
            # we must consider it in future works
            # if word is stopword allow stop words analysis
            resulted_data += self.check_word_as_stopword(word_nm)

            # check the word as a verb
            # NOTE: some excluded words may be analyzed as verbs or nouns
            # مشكلة بعض الكلمات المستبعدة تعتبر أفعلا أو اسماء
            resulted_data += self.check_word_as_verb(word_nm)
            # check the word as a noun
            resulted_data += self.check_word_as_noun(word_nm)
            if len(resulted_data) == 0:
                # parenthesized so it prints the utf8 bytes on Python 2 and
                # stays well-formed on Python 3 (the original py2 print
                # statement form raises AttributeError there)
                print((u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8'))
                # check the word as unknown
                resulted_data += self.check_word_as_unknown(word_nm)
            # check if the word is normalized and solutions are equivalent
            resulted_data = check_normalized(word_vocalised, resulted_data)
            # check if the word is shadda like
            resulted_data = check_shadda(word_vocalised, resulted_data)

            # add word frequency information in tags
            resulted_data = self.add_word_frequency(resulted_data)

            # add the stemmed words details into Cache
            data_list_to_serialize = [w.__dict__ for w in resulted_data]
            if self.allow_cache_use:
                self.cache.addChecked(word_nm, data_list_to_serialize)

        # check if the word is vocalized like results
        if self.partial_vocalization_support:
            resulted_data = check_partial_vocalized(word_vocalised,
                                                    resulted_data)

        # fall back to a single 'unknown' case so callers always get a result
        if len(resulted_data) == 0:
            resulted_data.append(wordcase.WordCase({
                'word':word,
                'affix': ('' , '', '', ''),
                'stem':word,
                'original':word,
                'vocalized':word,
                'semivocalized':word,
                'tags':u'',
                'type':'unknown',
                'root':'',
                'template':'',
                'freq':self.wordfreq.get_freq(word, 'unknown'),
                'syntax':'',
                }))
        return resulted_data
Ejemplo n.º 7
0
    def stemming_verb(self, verb):
        """
        Stem a conjugated verb and return all possible analyses.

        The verb is first segmented into procletic / stem / encletic, then
        each stem is segmented again into conjugation prefix / root-stem /
        conjugation suffix.  Candidate root-stems are looked up in the verb
        dictionary by stamp and re-conjugated with their affixes; only
        conjugations that regenerate the input word are kept.
        @param verb: given verb
        @type verb: unicode
        @return: stemmed words
        @rtype: list of wordcase.WordCase
        """
        list_found = []
        detailed_result = []
        verb = verb.strip()
        verb_list = [
            verb,
        ]
        # a leading alef madda (آ) may stand for أأ or ءا: try both spellings
        if verb.startswith(araby.ALEF_MADDA):
            verb_list.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE \
            +verb[1:])
            verb_list.append(araby.HAMZA + araby.ALEF + verb[1:])
        for verb in verb_list:

            # level one: split procletic / stem / encletic
            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                procletic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                encletic = verb[seg[1]:]
                # doubly transitive verb: keep only the first object suffix
                # حالة الفعل المتعدي لمفعولين
                # 'in' replaces dict.has_key(), which was removed in Python 3
                if encletic in svconst.TableDoubleTransitiveSuffix:
                    firstsuffix = \
                    svconst.TableDoubleTransitiveSuffix[encletic]['first']
                    encletic = firstsuffix

                list_stem = [stem]
                if encletic:
                    # an object pronoun implies the verb is transitive;
                    # restore letters altered before the pronoun suffix
                    transitive_comp = True
                    if stem.endswith(araby.TEH + araby.MEEM + araby.WAW):
                        list_stem.append(stem[:-1])
                    elif stem.endswith(araby.WAW):
                        list_stem.append(stem + araby.ALEF)
                    elif stem.endswith(araby.ALEF):
                        list_stem.append(stem[:-1] + araby.ALEF_MAKSURA)
                else:
                    transitive_comp = False
                if verb.startswith(araby.ALEF_MADDA):
                    # البداية بألف مد
                    list_stem.append(araby.ALEF_HAMZA_ABOVE + \
                    araby.ALEF_HAMZA_ABOVE+verb[1:])
                    list_stem.append(araby.HAMZA + araby.ALEF + verb[1:])
                # stem reduced verb : level two
                result = []
                for verb2 in list_stem:
                    # segment the conjugated verb
                    list_seg_conj = self.conj_stemmer.segment(verb2)

                    # verify affix compatibility
                    list_seg_conj = verify_affix(
                        verb2, list_seg_conj, svconst.VERBAL_CONJUGATION_AFFIX)
                    # verify procletics and enclitics
                    # verify length of stem
                    list_seg_conj2 = []
                    for seg_conj in list_seg_conj:
                        # a verb stem longer than 6 letters cannot be valid
                        if (seg_conj[1] - seg_conj[0]) <= 6:
                            prefix_conj = verb2[:seg_conj[0]]
                            stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                            suffix_conj = verb2[seg_conj[1]:]
                            affix_conj = prefix_conj + '-' + suffix_conj

                            # verify compatibility between procletics and affix
                            if (is_compatible_proaffix_affix(
                                    procletic, encletic, affix_conj)):
                                # verify the existence of a verb stamp in
                                # the dictionary
                                if self.verb_dictionary.exists_as_stamp(
                                        stem_conj):
                                    list_seg_conj2.append(seg_conj)

                    list_seg_conj = list_seg_conj2
                    list_correct_conj = []

                    for seg_conj in list_seg_conj:
                        prefix_conj = verb2[:seg_conj[0]]
                        stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                        suffix_conj = verb2[seg_conj[1]:]
                        affix_conj = '-'.join([prefix_conj, suffix_conj])

                        # search the verb in the dictionary by stamp:
                        # if the verb exists, transitivity is considered;
                        # if it is trilateral, its forms come with tashkeel,
                        # otherwise without, because the conjugator can
                        # vocalize it in the conjugation step
                        infverb_dict = self.get_infinitive_verb_by_stem(
                            stem_conj, transitive_comp)

                        infverb_dict = self.verify_infinitive_verbs(
                            stem_conj, infverb_dict)

                        for item in infverb_dict:
                            # the haraka is given by the dictionary entry
                            inf_verb = item['verb']
                            haraka = item['haraka']
                            transitivity = item['transitive'] == 'y'
                            unstemed_verb = verb2

                            # conjugation step: conjugate the verb with its
                            # affixes and keep only conjugations that match
                            # the input word
                            # تصريف الفعل مع الزوائد
                            # إذا توافق التصريف مع الكلمة الناتجة
                            # تعرض النتيجة
                            onelist_correct_conj = generate_possible_conjug(
                                inf_verb, unstemed_verb, affix_conj, haraka,
                                procletic, encletic, transitivity)

                            if len(onelist_correct_conj) > 0:
                                list_correct_conj += onelist_correct_conj
                    # NOTE(review): prefix_conj/stem_conj/suffix_conj below
                    # hold the values of the LAST seg_conj iteration, even
                    # though list_correct_conj accumulates over all of them —
                    # looks like a latent bug in the original; preserved as-is
                    for conj in list_correct_conj:
                        result.append(conj['verb'])
                        vocalized, semivocalized = vocalize(
                            conj['vocalized'], procletic, encletic)
                        tag_type = 'Verb'
                        original_tags = "y" if conj['transitive'] else "n"

                        detailed_result.append(wordcase.WordCase({
                            'word':verb,
                            'affix': ( procletic, prefix_conj, suffix_conj,
                            encletic),
                            'stem':stem_conj,
                            'original':conj['verb'],
                            'vocalized':vocalized,
                            'semivocalized':semivocalized,
                            'tags':u':'.join((conj['tense'], conj['pronoun'])+\
                            svconst.COMP_PREFIX_LIST_TAGS[procletic]['tags']+\
                            svconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags']),
                            'type':tag_type,
                            'number': conj['pronoun_tags'].get('number',''),
                            'gender': conj['pronoun_tags'].get('gender',''),
                            'person': conj['pronoun_tags'].get('person',''),
                            'tense2': conj['tense_tags'].get('tense',''),
                            'voice': conj['tense_tags'].get('voice',''),
                            'mood': conj['tense_tags'].get('mood',''),
                            'confirmed': conj['tense_tags'].get('confirmed',''),
                            'transitive': conj['transitive'],
                            'tense': conj['tense'],
                            'pronoun': conj['pronoun'],
                            'freq':'freqverb',
                            'originaltags':original_tags,
                            'syntax':'',
                            }))

                list_found += result

        # list_found only tracks which conjugated forms matched; callers
        # consume the detailed analyses
        return detailed_result
Ejemplo n.º 8
0
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
        Analyze word morphologically by stemming the conjugation affixes.
        @param noun: the input noun.
        @type noun: unicode.
        @param noun2: the noun stemmed from syntaxic affixes.
        @type noun2: unicode.
        @param procletic: the syntaxic prefix extracted in the first stage.
        @type procletic: unicode.
        @param encletic: the syntaxic suffix extracted in the first stage.
        @type encletic: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated noun into (prefix, stem, suffix) offsets
        segmentations = self.conj_stemmer.segment(noun2)
        # keep only affix pairs known as valid nominal conjugations
        segmentations = verify_affix(noun2, segmentations,
                                     snconst.NOMINAL_CONJUGATION_AFFIX)

        # expand every segmentation with each vocalized form of its suffix,
        # keeping only the forms compatible with the syntaxic affixes
        candidates = []
        for seg in segmentations:
            raw_stem = noun2[seg[0]:seg[1]]
            raw_suffix = noun2[seg[1]:]
            for voc_suffix in \
            snconst.CONJ_SUFFIX_LIST_TAGS[raw_suffix]['vocalized']:
                if is_compatible_proaffix_affix(procletic, encletic,
                                                voc_suffix):
                    candidates.append({
                        'prefix': '',
                        'suffix': voc_suffix,
                        'stem': raw_stem
                    })

        for candidate in candidates:
            prefix_conj = candidate['prefix']
            suffix_conj = candidate['suffix']
            # normalize hamza before guessing the different origins
            stem_conj = tashaphyne.normalize.normalize_hamza(
                candidate['stem'])
            # add stripped letters back to the stem to build every
            # possible noun form
            possible_nouns = get_stem_variants(stem_conj, prefix_conj,
                                               suffix_conj)
            # collect dictionary entries for every candidate form;
            # we can return the tashkeel
            found_tuples = []
            for candidate_noun in possible_nouns:
                found_tuples += self.noun_dictionary.lookup(
                    candidate_noun, 'unknown')
            for noun_tuple in found_tuples:
                infnoun = noun_tuple['vocalized']
                original_tags = ()
                wordtype = noun_tuple['word_type']
                vocalized = vocalize(infnoun, procletic, prefix_conj,
                                     suffix_conj, encletic)
                detailed_result.append(
                    wordcase.WordCase({
                        'word': noun,
                        'affix': (procletic, prefix_conj, suffix_conj,
                                  encletic),
                        'stem': stem_conj,
                        'original': infnoun,
                        'vocalized': vocalized,
                        'semivocalized': vocalized,
                        'tags': u':'.join(
                            snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags']
                            + snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags']
                            + snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                        'type': u':'.join(['Noun', wordtype]),
                        'freq': noun_tuple['freq'],
                        'originaltags': u':'.join(original_tags),
                        'syntax': '',
                    }))

        return detailed_result