Example #1
def main(args):

    generator = alyahmor.genelex.genelex()
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generator.generate_noun_affix_list()
    print(arepr(noun_affixes).replace(',', ',\n'))

    print('VERB_AFFIX_LIST=')
    verb_affixes = generator.generate_verb_affix_list()
    print(arepr(verb_affixes).replace(',', ',\n'))
    return 0
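
All of these examples use arepr from pyarabic.arabrepr to dump Arabic-bearing lists and dicts in readable form, then insert newlines with str.replace for legibility. A minimal self-contained sketch of that pattern (the sample values here are made up for illustration):

from pyarabic.arabrepr import arepr

# hypothetical data, just to show the printing pattern used in these examples
forms = {u"word": u"قصد", u"affixes": [u"ال", u"ون"]}
# arepr keeps the Arabic characters literal instead of escaping them;
# the ',\n' replace simply puts one item per line
print(arepr(forms).replace(',', ',\n'))
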
Example #2
 def test_affix():
     generator = alyahmor_genelex.genelex()
     word = u"قصد"
     wtype = "verb"
     list_forms = generator.generate_affix_list(word_type=wtype,
                                                indexed=True)
     print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
     wtype = "noun"
     print('********* Noun ************')
     list_forms = generator.generate_affix_list(word_type=wtype,
                                                indexed=True)
     print(arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
Example #3
 def add_footer(self):
     """close the data set, used for ending xml, or sql"""
     
     text = "STOPWORDS="
     text += arepr(self.STOPWORDS).decode('utf8')
     if self.generate_all_forms : 
         text += "\n\nSTOPWORDS_INDEX="     
         text += arepr(self.STOPWORDS_INDEX).decode('utf8')
     # add newlines for more readability
     text = text.replace('}],', '}],\n')
     text = text.replace('],', '],\n')            
     text = text.replace('),', '),\n')            
     
     return text
 def log(self, data, msg=""):
     """ display internal data"""
     if not self.debug:
         return False
     else:
         print(msg)
         print(arepr(data))
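
The newline-insertion chains in add_footer recur throughout these examples. A small hypothetical helper (the name pretty and its exact behavior are an editor's sketch, not part of the original code) could factor the pattern out:

from pyarabic.arabrepr import arepr

def pretty(obj):
    # hypothetical helper: arepr output with a line break after each
    # closing bracket/parenthesis, mirroring the replace chain in add_footer
    text = arepr(obj)
    for sep in ('}],', '],', '),'):
        text = text.replace(sep, sep + '\n')
    return text
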
Example #5
def main(args):
    df = pd.read_csv(
        "samples/Arabic-patterns/Arabic-patterns-tabbed-v2.txt",
        encoding='utf8',
        delimiter='\t',
    )
    outfile = "output/Arabic-patterns-tabbed.csv"
    # preprocess columns
    for name in df.columns.values:
        #~ print name
        df[name] = df[name].apply(preprocess)
    # convert trans
    df["pattern"] = df["pattern"].apply(arabtrans.tim2utf8)
    df.to_csv(outfile + "debug", sep='\t', encoding='utf-8')
    df["singularPattern"] = df["singularPattern"].apply(arabtrans.tim2utf8)
    df['rhyzome'] = df['pattern'].apply(extract_rhyzome)
    df['unvocalized'] = df['pattern'].apply(araby.strip_tashkeel)
    df['weak'] = df['rhyzome'].apply(classify_rhyzome)

    print(df.head())
    #~ generate_rooton_list()
    df.sort_values(by=['rhyzome'], ascending=True, inplace=True)
    #~ df2 = df[['rhyzome', 'pattern', 'weak', 'examples']]  # superseded by the selection below
    df2 = df[[
        'rhyzome', 'unvocalized', 'pattern', 'weak', 'examples'
    ]]  #, 'singularPattern', 'type', 'nType', 'vType', 'isBrokenPlural', 'hasBrokenPlural', 'hasFem', 'subOf','examples']

    df2.to_csv(outfile, sep='\t', encoding='utf-8')
    rhyzomes = list(df['rhyzome'].unique())
    print(arepr(rhyzomes))
    return 0
Example #6
def main(args):
    word = u"قَصْدٌ"    
    noun_forms = generate_noun_forms(word)
    #~ print(arepr(noun_forms).replace('),', '),\n'))
    #~ print('************verb*****')
    word = u"قصد"    
    verb_forms = generate_verb_forms(word)
    #~ print(arepr(verb_forms).replace('),', '),\n'))
    
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generate_noun_affix_list()
    print(arepr(noun_affixes).replace(',', ',\n'))
    
    print('VERB_AFFIX_LIST=')
    verb_affixes = generate_verb_affix_list()
    print(arepr(verb_affixes).replace(',', ',\n'))
    return 0
Example #7
 def test_generate_one(tuple_list):
     generator = alyahmor_genelex.genelex()
     for word, wtype, affixes in tuple_list:
         affixes = affixes.split("-")
         list_forms = generator.generate_by_affixes(word,
                                                    word_type=wtype,
                                                    affixes=affixes)
         print(
             arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
Example #8
def test_rooter3(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    asl = abstractstemmer.customStemmer_roots_rhyzome()
    # debug in rhyzome rooter
    asl.rootdict.rhyzome_rooter.debug = True
    df = dataframe_result
    # avoid null roots

    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        if not is_stop(word):
            word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                          araby.HAMZA + araby.ALEF, word)
            asl.light_stem(word)
            default_root = asl.get_root()
            starword = asl.get_starword()
            asl.segment(word)
            affixa_list = asl.get_affix_list()
            # filter valid affixes
            affixa_list = list(filter(asl.verify_affix, affixa_list))
            #~ root_result = rootslib.choose_root(affixation_list)
            if True:
                stems = [d['stem'] for d in affixa_list]
                roots = [d['root'] for d in affixa_list]
                print((u"**********%s*********" % word).encode('utf8'))
                print((u"Start Word : %s" % starword).encode('utf8'))
                print("Stems: " + u' '.join(stems).encode('utf8'))
                print((u"Dafault roots: [%s] a %s" %
                       (default_root, u' '.join(roots))).encode('utf8'))
                print(arepr(affixa_list))

            root_result = asl.rootdict.choose_root(word,
                                                   affixa_list,
                                                   debug=True)
        else:
            root_result = stop_root(word)
            roots = []
            stems = []
            startword = ""
            default_root = ""
            affixa_list = []
        if root_result in root_list:
            cpt += 1
        if True:
            print((u" ".join([
                u"Test root", root, u"found root", root_result,
                str(root_result in root_list)
            ])).encode('utf8'))

    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Example #9
def test():
    # readfile
    filename = "samples/majdi-patterns.csv"
    outfile = "output/majdi-patterns.csv"
    try:
        df = pd.read_csv(filename, delimiter='\t',
          #~ names=['word', 'root', 'lemma', 'type','non'],
          encoding="utf-8",
          #~ skiprows=1,
          )
    except:
        print("Can't open the given file ", filename)
        sys.exit()
    print(df.head())
    df['rhyzome'] = df['Pattern'].apply(extract_rhyzome)
    print('**********after rhyzome******')
    print(df.head())
    print(df.head(100))
    # save the dataframe to csv
    df.to_csv(outfile, sep='\t', encoding='utf-8')
    rhyzomes = list(df['rhyzome'].unique())
    # filter some rhyzomes
    # avoid some patterns
    rhyzomes = [r for r in rhyzomes if not( araby.HEH in r or araby.MEEM in r or araby.NOON in r)]

    print(u"****rhyzomes****")
    print(u"\n".join(rhyzomes).encode('utf8'))
    print(len(rhyzomes))
    reduced = []
    for r in rhyzomes:
        reduced.extend(make_weak_rhyzome(r))
    reduced = list(set(reduced))
    reduced = [x for x in reduced if x not in rhyzomes]
    print(u"****reduced****")
    print(u"\n".join(reduced).encode('utf8'))
    print(len(reduced))
    rhyzomes.extend(reduced)
    print('In wazns not in Rhyzomes')
    diff2 = [x for x in WAZNS if x not in rhyzomes]    
    print(arepr(diff2))    
    print('RHYZOMES=')
    print(arepr(rhyzomes))
Example #10
def main(args):

    generator = genelex.genelex()
    print('NOUN_AFFIX_LIST=')
    noun_affixes = generator.generate_affix_list(word_type="noun",
                                                 vocalized=False)
    print(arepr(noun_affixes).replace(',', ',\n'))

    print('VERB_AFFIX_LIST=')
    verb_affixes = generator.generate_affix_list(word_type="verb",
                                                 vocalized=False)
    print(arepr(verb_affixes).replace(',', ',\n'))

    # print prefixes and affixes

    noun_prefixes, noun_suffixes = generator.generate_prefix_suffix_list(
        word_type="noun", vocalized=False)
    print('NOUN_PREFIX_LIST=')
    print(arepr(noun_prefixes).replace(',', ',\n'))
    print('NOUN_SUFFIX_LIST=')
    print(arepr(noun_suffixes).replace(',', ',\n'))

    verb_prefixes, verb_suffixes = generator.generate_prefix_suffix_list(
        word_type="verb", vocalized=False)

    print('VERB_PREFIX_LIST=')
    print(arepr(verb_prefixes).replace(',', ',\n'))
    print('VERB_SUFFIX_LIST=')
    print(arepr(verb_suffixes).replace(',', ',\n'))

    return 0
Example #11
    def test(tuple_list):
        generator = alyahmor_genelex.genelex()

        for word, wtype in tuple_list:
            print('************%s*****' % wtype)
            list_forms = generator.generate_forms(word, word_type=wtype)
            print(
                arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
            list_forms = generator.generate_forms(word,
                                                  word_type=wtype,
                                                  vocalized=False)
            print(
                arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
            list_forms = generator.generate_forms(word,
                                                  word_type=wtype,
                                                  indexed=True)
            print(
                arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
            list_forms = generator.generate_affix_list(word_type=wtype,
                                                       indexed=True)
            print(
                arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
Example #12
 def print_tuple(self, vdict):
     """
     convert a result dict (vdict) to a tab-separated string
     """
     if type(vdict) is list:
         print(arepr(vdict))
     if "text" in vdict:
         return vdict['text']
     else:
         return u'\t'.join([
             vdict["word"],
             vdict["triliteral"],
             vdict["root"],
             vdict["future_type"],
             vdict["transitive"],
             str(vdict["nb_trans"]),
             vdict["object_type"],
             vdict["reflexive_type"],
             vdict["tenses"],
             vdict["model"],
             str(vdict["nb_case"]),
             vdict["verb_cat"],
             vdict["suggest"],
         ])
Example #13
def main(args):
    args = grabargs()
    filename = args.filename
    outfile = args.outfile
    try:
        myfile = open(filename)
    except:
        print("Can't Open file %s" % filename)
        sys.exit()
    lines = myfile.readlines()
    debug = True
    limit = 500
    generator = alyahmor.genelex.genelex()
    #~ words = araby.tokenize(text)
    tuple_list = [l.decode('utf8').strip().split('\t') for l in lines]
    for word, wtype in tuple_list:
        if wtype == "noun":
            print('************Noun*****')
            noun_forms = generator.generate_noun_forms(word)
            print(arepr(noun_forms).replace('),', '),\n'))
        if wtype == "verb":
            print('************verb*****')
            verb_forms = generator.generate_verb_forms(word)
            print(arepr(verb_forms).replace('),', '),\n'))
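
The script expects a plain-text file with one word<TAB>type pair per line, where the type is "noun" or "verb". A hypothetical way to prepare such an input file (the file name and sample words are illustrative only):

import codecs

# one "word\ttype" pair per line, matching the split('\t') parsing above
rows = [(u"قصد", u"verb"), (u"كتاب", u"noun")]
with codecs.open("wordlist.txt", "w", encoding="utf8") as myfile:
    for word, wtype in rows:
        myfile.write(u"%s\t%s\n" % (word, wtype))
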
Example #14
    def stemming_verb(self, verb_in):
        """
        Stemming verb
        @param verb_in: given verb
        @type verb_in: unicode
        @return : stemmed words
        @rtype:
        """
        if not verb_in:
            return None
        #~ list_found = []
        detailed_result = []
        verb_list = [
            verb_in,
        ] + self.get_verb_variants(verb_in)
        verb_list = list(set(verb_list))
        debug = self.debug
        #list of segmented words
        word_segmented_list = []
        for verb in verb_list:

            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                proclitic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                enclitic = verb[seg[1]:]
                #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
                #~secondsuffix = u''
                # case of a doubly transitive verb (taking two objects)
                if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                    firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                    enclitic = firstsuffix

                list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
                #if enclitic, then transitive is ok
                transitive_comp = bool(enclitic)
                for stm in list_stem:
                    word_seg = {
                        "verb": verb,
                        "pro": proclitic,
                        "enc": enclitic,
                        'stem_comp': stm,
                        'trans_comp': transitive_comp,
                    }
                    word_segmented_list.append(word_seg)
        if debug: print("after first level")
        if debug:
            #~ print(repr(word_segmented_list).replace(
            #~ '},', '},\n').decode("unicode-escape"))
            print(arepr(verb_in))
            print(print_table(word_segmented_list))
        # second level for segmented word
        tmp_list = []
        #~ print 'first level', verb_in, len(word_segmented_list)
        for word_seg in word_segmented_list:
            verb2 = word_seg['stem_comp']
            # stem reduced verb : level two
            #segment the conjugated verb
            list_seg_conj = self.conj_stemmer.segment(verb2)

            # verify affix compatibility
            list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                              SVC.VERBAL_CONJUGATION_AFFIX)
            # verify proclitics and enclitics
            # verify length of stem
            for seg_conj in list_seg_conj:
                if (seg_conj[1] - seg_conj[0]) <= 6:

                    #word seg in level 2
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                    word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                    word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                    tmp_list.append(word_seg_l2)

        # verify compatibility between proclitic and affixes
        word_segmented_list = tmp_list
        #~ print 'compatibility', verb_in, len(tmp_list)
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify compatibility between proclitics and affixes
            proclitic = word_seg['pro']
            enclitic = word_seg['enc']
            affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
            if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
                tmp_list.append(word_seg.copy())

        #~ print 'stamp', verb_in, len(tmp_list)
        # verify existence of candidate verb by stamp
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify existence of candidate verb by stamp
            if self.exists_as_stamp(word_seg['stem_conj']):
                tmp_list.append(word_seg.copy())

        if debug: print("after second level")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print 'infinitive', verb_in, len(tmp_list)
        # get infinitive of candidate verbs
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get infinitive of candidate verb by stamp

            # search the verb in the dictionary by stamp.
            # if the verb exists in the dictionary,
            # the transitivity is considered.
            # if it is trilateral, return its forms and tashkeel;
            # if not, return forms without tashkeel,
            # because the conjugator can vocalize it.
            # we could return the tashkeel if we don't need the
            # conjugation step
            infverb_dict = self.__get_infinitive_verb_by_stem(
                word_seg['stem_conj'], word_seg['trans_comp'])
            if debug: print("infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            #~ print "list possible verbs", len(infverb_dict)
            #~ for item in infverb_dict:
            #~ print item['verb']
            # filter verbs
            infverb_dict = self.__verify_infinitive_verbs(
                word_seg['stem_conj'], infverb_dict)

            if debug: print("valid infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            for item in infverb_dict:
                # the haraka is given from the dict entry
                word_seg_l3 = word_seg.copy()
                word_seg_l3['inf'] = item['verb']
                word_seg_l3['haraka'] = item['haraka']
                word_seg_l3['root'] = item.get('root', '')
                word_seg_l3['transitive'] = bool(item['transitive'] in ('y',
                                                                        1))
                tmp_list.append(word_seg_l3)
                # conjugation step
        if debug: print("after lookup dict")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
        #~ print 'conj', verb_in, len(tmp_list)
        # get conjugation for every infinitive verb
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # ToDo: conjugate the verb with the affixes;
            # if one conjugated form matches the resulting word,
            # return that result

            one_correct_conj = self.__generate_possible_conjug(
                word_seg['inf'], word_seg['stem_comp'],
                word_seg['prefix'] + '-' + word_seg['suffix'],
                word_seg['haraka'], word_seg['pro'], word_seg['enc'],
                word_seg['transitive'])

            #~ print "len correct_conj", len(one_correct_conj)
            for conj in one_correct_conj:
                word_seg_l4 = word_seg.copy()
                word_seg_l4['conj'] = conj.copy()
                tmp_list.append(word_seg_l4)
        if debug: print("after generating conjugation")
        if debug:
            print(arepr(verb_in))
            conjs = [item['conj'] for item in tmp_list]
            print(print_table(conjs))
        #~ print 'result', verb_in, len(tmp_list)
        # generate all resulted data
        word_segmented_list = tmp_list

        #~ tmp_list = []
        for word_seg in word_segmented_list:
            conj = word_seg['conj']
            #~ vocalized, semivocalized = self.vocalize(
            vocal_tuple_list = self.vocalize(conj['vocalized'],
                                             word_seg['pro'], word_seg['enc'])
            tag_type = 'Verb'
            original_tags = "y" if conj['transitive'] else "n"
            # ~ print("stem_verb", vocal_tuple_list)
            for vocalized, semivocalized, __ in vocal_tuple_list:
                # ~ for XXX in vocal_tuple_list:
                # prepare tags
                tags = self.prepare_tags(conj, proclitic, enclitic)

                detailed_result.append(
                    wordcase.WordCase({
                        'word':
                        word_seg['verb'],
                        'affix': (word_seg['pro'], word_seg['prefix'],
                                  word_seg['suffix'], word_seg['enc']),
                        'stem':
                        word_seg['stem_conj'],
                        'root':
                        ar.normalize_hamza(word_seg.get('root', '')),
                        'original':
                        conj['verb'],
                        'vocalized':
                        vocalized,
                        'semivocalized':
                        semivocalized,
                        'tags':
                        tags,  #\
                        'type':
                        tag_type,
                        'number':
                        conj['pronoun_tags'].get('number', ''),
                        'gender':
                        conj['pronoun_tags'].get('gender', ''),
                        'person':
                        conj['pronoun_tags'].get('person', ''),
                        'tense2':
                        conj['tense_tags'].get('tense', ''),
                        'voice':
                        conj['tense_tags'].get('voice', ''),
                        'mood':
                        conj['tense_tags'].get('mood', ''),
                        'confirmed':
                        conj['tense_tags'].get('confirmed', ''),
                        'transitive':
                        conj['transitive'],
                        'tense':
                        conj['tense'],
                        'pronoun':
                        conj['pronoun'],
                        'freq':
                        'freqverb',
                        'originaltags':
                        original_tags,
                        'syntax':
                        '',
                    }))

        return detailed_result
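
stemming_verb repeatedly narrows a list of candidate segmentations: each stage copies the surviving dicts into tmp_list and then swaps it back into word_segmented_list. Stripped of the morphology, the control flow looks roughly like this (a sketch with placeholder stage functions, not the library's API):

def run_stages(candidates, stages):
    # each stage maps a candidate dict to an updated copy, or None to drop it
    for stage in stages:
        tmp_list = []
        for seg in candidates:
            new_seg = stage(seg)
            if new_seg is not None:
                tmp_list.append(new_seg)
        candidates = tmp_list  # survivors feed the next stage
    return candidates
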
Example #15
             u"جاء مليونان وألفان وإثنا عشر",
             u"وجدت خمسمئة وثلاث وعشرون دينارا",
             u"خمسمئة وثلاث وعشرون دينارا",
             u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
             u"لم أجد شيئا",
             u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
             u'من ثلاثمئة وخمسين بلدا ',
             u'من ثلاثمئة وخمسين بلدا ',
             u'من أربعمئة وخمسين بلدا ',
             u'السلام عليكم 2014',
            ]
    #~ arepr = arabrepr.ArabicRepr()
    for txt in TEXTS:
        word_list = araby.tokenize(txt)
        positions_phrases = detect_number_phrases_position(word_list)
        print(positions_phrases)
        nb_phrases = extract_number_phrases(txt)
        tag_list = detect_numbers(word_list)
        print(tag_list)
        print(u" ".join(word_list))
        print(list(zip(tag_list, word_list)))
        print('tashkeel')
        tashkeel = u" ".join(pre_tashkeel_number(word_list))
        if sys.version_info < (3, 0):
            print(tashkeel.encode('utf8'))
        else:
            print(tashkeel)
        print(u'\t'.join(nb_phrases))
        print("detect number word")
        print(arabrepr.arepr(detect_number_words(txt)))
Example #16
        return []


def main(args):
    word = u"لعلهم"
    print(stop_stem(word))
    return 0


if __name__ == '__main__':
    import sys
    from pyarabic.arabrepr import arepr
    words = [
        (u'منكم', True),
        (u'ممكن', False),
        (u'عندما', True),
        (u'حينئذ', True),
    ]
    for w, rep in words:
        result = is_stop(w)
        if result != rep:
            print((u"Error %s is %swhere must be %s" %
                   (w, result, rep)).encode('utf8'))

    print(len(stopwords_list()))
    print(len(classed_stopwords_list()))
    print(arepr(stopword_forms(u'حتى')))
    print(arepr(stopword_forms(u'جميع')))
    print(arepr(stop_stem(u'لجميعهم')))
    print(arepr(stop_stem(u'لجم')))
Example #17
        targets = [x.strip() for x in item[1:] if x.strip() ]
        targets = [araby.strip_tashkeel(x.strip()) for x in targets if x]
        word_list = araby.tokenize(text1)
        tag_list2 = chunker.detect_chunks(word_list)
        result = chunker.extract_chunks(text1)

        equal, inequal = eval_score(targets, result)
        print("Equal",equal, inequal)        
        tests['correct'] += equal
        tests['incorrect'] += inequal
        
        if inequal and debug:
            # debug 
            print("ID"+str(key), text1.encode('utf8'))
            print("result")
            print(arepr(result))
            print("target")
            print(arepr(targets))
            #~ result2 = chunker.detect_chunks(word_list)        
            #~ print(arepr(result2))        
            result2 = chunker.detect_positions(word_list, debug=True)        
            print(arepr(result2))            
        
        #~ tuples = (zip(tag_list2, word_list))
        #~ for tup in tuples:
            #~ print(repr(tup).decode('unicode-escape').encode('utf8'))

   
    # tests
    for item in test_texts:
        text1 = item[0]
Example #18
 def test2(tuple_list):
     generator = alyahmor_genelex.genelex()
     for word, wtype in tuple_list:
         list_forms = generator.generate_forms(word, word_type=wtype)
         print(
             arepr(list_forms).replace('),', '),\n').replace('],', '],\n'))
Example #19
 def display_all(self):
     """ display all contents of data base """
     #~ pass
     print "aranasyn.cache: dislay all records in Thaalib Database """
     for curr in self.db.all('a', with_doc=True):
         print curr['doc']['a'], arepr(curr['doc']['d'])
Example #21
    def stemming_noun(self, noun_in):
        """
        Analyze word morphologically as noun
        @param noun_in: the input noun.
        @type noun_in: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        self.set_error_code('')
        if not noun_in:
            self.set_error_code('Empty word')
            return None
        debug = self.debug
        #~list_found = []
        detailed_result = []
        noun_list = [
            noun_in,
        ] + self.get_noun_variants(noun_in)
        word_segmented_list = []
        for noun in noun_list:
            list_seg_comp = self.comp_stemmer.segment(noun)
            # filter
            list_seg_comp = self.verify_affix(noun, list_seg_comp,
                                              SNC.COMP_NOUN_AFFIXES)
            # treat multi vocalization enclitic
            for seg in list_seg_comp:
                proclitic_nm = noun[:seg[0]]
                stem = noun[seg[0]:seg[1]]
                enclitic_nm = noun[seg[1]:]
                # adjusting noun variants
                list_stem = [
                    stem,
                ] + self.get_input_stem_variants(stem, enclitic_nm)

                # stem reduced noun : level two
                for stem in list_stem:
                    word_seg = {
                        'noun': noun,
                        'stem_comp': stem,
                        'pro': proclitic_nm,
                        'enc': enclitic_nm,
                    }
                    word_segmented_list.append(word_seg)
        if not word_segmented_list:
            self.set_error_code(" First level segmentation error")

        # level two

        tmp_list = []
        if debug: print("after first level")
        if debug:
            #~ print(repr(word_segmented_list).replace(
            #~ '},', '},\n').decode("unicode-escape"))
            print(arepr(noun_in))
            print(print_table(word_segmented_list))

        for word_seg in word_segmented_list:

            #~ detailed_result.extend(
            #~ self.steming_second_level(word_seg['noun'], word_seg['stem_comp'],
            #~ word_seg['pro'], word_seg['enc']))
            #~ detailed_result_one = []
            # segment the conjugated noun
            list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
            # verify affix compatibility
            # filter
            list_seg_conj = self.verify_affix(word_seg['stem_comp'],
                                              list_seg_conj,
                                              SNC.NOMINAL_CONJUGATION_AFFIX)
            # add vocalized forms of suffixes
            # and create the real affixes from the word
            for seg_conj in list_seg_conj:
                stem_conj = word_seg['stem_comp'][:seg_conj[1]]
                suffix = word_seg['stem_comp'][seg_conj[1]:]
                stem_conj = ar.normalize_hamza(stem_conj)
                stem_conj_list = self.get_stem_variants(stem_conj, suffix)

                # generate possible stems
                # add stripped letters to the stem to constitute possible noun list
                for stem in stem_conj_list:
                    word_seg_l2 = word_seg.copy()
                    # normalize hamza before guessing different origins
                    word_seg_l2['stem_conj'] = stem
                    word_seg_l2['suffix'] = suffix
                    #affixes tags contains prefixes and suffixes tags
                    word_seg_l2['affix_tags'] = list(
                        set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]
                            ['tags'] + SNC.COMP_SUFFIX_LIST_TAGS[
                                word_seg_l2['enc']]['tags'] +
                            SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg_l2['suffix']]['tags']))
                    tmp_list.append(word_seg_l2)

        if debug: print("after second level")
        if debug:
            print(arepr(noun_in))
            print(print_table(tmp_list))
        # lookup in dictionary
        if not tmp_list:
            self.set_error_code(" Second level segmentation error")
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # search the noun in the dictionary
            # we can return the tashkeel
            inf_noun = word_seg['stem_conj']
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in
            #broken plural dictionary
            if inf_noun in self.cache_dict_search:
                infnoun_foundlist = self.cache_dict_search[inf_noun]
            else:
                infnoun_foundlist = self.lookup_dict(inf_noun)
                self.cache_dict_search[inf_noun] = infnoun_foundlist

            for noun_tuple in infnoun_foundlist:
                word_seg_l3 = word_seg.copy()
                word_seg_l3["original"] = noun_tuple['vocalized']
                word_seg_l3["noun_tuple"] = dict(noun_tuple)
                tmp_list.append(word_seg_l3)

        if debug: print("after lookup dict")
        if debug:
            print(arepr(noun_in))
            noun_tuples = [item['noun_tuple'] for item in tmp_list]
            print(print_table(noun_tuples))
        # test compatibility of noun_tuple with affixes and proclitics
        # and generate vocalized affixes and suffixes
        if not tmp_list:
            self.set_error_code("Not exists in dictionary")
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # test whether the word found in the dictionary accepts the
            # tags given by the affixes,
            # e.g. does the noun accept the feminine marker
            if self.validate_tags(word_seg['noun_tuple'],
                                  word_seg['affix_tags'], word_seg['pro'],
                                  word_seg['enc'], word_seg['suffix']):
                ## get all vocalized form of suffixes
                for pro_voc in SNC.COMP_PREFIX_LIST_TAGS[
                        word_seg['pro']]['vocalized']:
                    for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                            word_seg['enc']]['vocalized']:
                        for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg['suffix']]['vocalized']:
                            ## verify compatibility between proclitics and affix
                            if self.__check_clitic_affix(
                                    word_seg['noun_tuple'], pro_voc, enc_voc,
                                    suf_voc):
                                # get affix tags
                                affix_tags_voc = SNC.COMP_PREFIX_LIST_TAGS[pro_voc]['tags']\
                                  +SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags']\
                                  +SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                                word_seg_l4 = word_seg.copy()
                                word_seg_l4['suf_voc'] = suf_voc
                                word_seg_l4['enc_voc'] = enc_voc
                                word_seg_l4['affix_tags'] = affix_tags_voc
                                tmp_list.append(word_seg_l4)

        if debug: print("after check compatibility")
        if debug:
            print(arepr(noun_in))
            noun_tuples = [item['noun_tuple'] for item in tmp_list]
            print(print_table(noun_tuples))
        # Generate results
        if not tmp_list:
            self.set_error_code("Affixes not compatible")

        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get vocalized and semi-vocalized (without inflection) forms
            #~ vocalized, semi_vocalized, _ = self.vocalize(
            voca_tuple_list = self.vocalize(
                word_seg['noun_tuple']['vocalized'], word_seg['pro'],
                word_seg['suf_voc'], word_seg['enc_voc'])
            for vocalized, semi_vocalized, _ in voca_tuple_list:
                #add some tags from dictionary entry as
                #mamnou3 min sarf and broken plural
                original_tags = []
                if word_seg['noun_tuple']['mankous'] == u"Tk":
                    original_tags.append(u"منقوص")
                # if there are several cases, like feminine plural with mansoub and majrour
                if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                    list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                        word_seg['suf_voc']]['cases']
                else:
                    list_cases = ('', )
                for case in list_cases:
                    voc_affix_case = word_seg['affix_tags'] + (case, )
                    # filter empty
                    voc_affix_case = [vac for vac in voc_affix_case if vac]
                    detailed_result.append(
                        wordcase.WordCase({
                            'word':
                            noun_in,
                            'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                                      word_seg['enc_voc']),
                            'stem':
                            word_seg['stem_conj'],
                            'root':
                            ar.normalize_hamza(word_seg['noun_tuple'].get(
                                'root', '')),
                            'original':
                            word_seg['noun_tuple']['vocalized'],  #original,
                            'vocalized':
                            vocalized,
                            'semivocalized':
                            semi_vocalized,
                            'tags':
                            u':'.join(voc_affix_case),
                            'type':
                            u':'.join(
                                ['Noun', word_seg['noun_tuple']['wordtype']]),
                            'number':
                            word_seg['noun_tuple']['number'],
                            'gender':
                            word_seg['noun_tuple']['gender'],
                            'freq':
                            'freqnoun',  # to note the frequency type
                            'originaltags':
                            u':'.join(original_tags),
                            'syntax':
                            '',
                        }))
        if not detailed_result:
            self.set_error_code("Forms are not generated")

        if debug: print("after generate result")
        if debug: print(len(detailed_result))
        #~ if debug: print repr(detailed_result).replace('},','},\n').decode("unicode-escape")
        return detailed_result
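
The compatibility step in stemming_noun enumerates every vocalized (proclitic, enclitic, suffix) combination with three nested loops. The same enumeration can be sketched with itertools.product (a structural simplification with stand-ins for the SNC tables and __check_clitic_affix, not the stemmer's actual code):

from itertools import product

def compatible_combinations(pro_vocs, enc_vocs, suf_vocs, check):
    # keep only the clitic/suffix combinations accepted by the check callback
    return [(pro, enc, suf)
            for pro, enc, suf in product(pro_vocs, enc_vocs, suf_vocs)
            if check(pro, enc, suf)]
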