def split_accourding_numbers(wordlist):
    """Regroup a list of words so that each number phrase becomes one item.

    The words are tagged with ``number.detect_numbers``. A word tagged
    ``u'DB'`` opens a new phrase, words tagged ``u'DI'`` are appended to the
    open phrase (separated by a single space), and any other tag closes the
    open phrase. (Presumably 'DB'/'DI' mean "number begin"/"number inside";
    confirm against the ``number`` module.)

    Words that are not part of a number phrase are kept as individual items,
    in their original order, so the result covers the whole input just like
    the fallback value does.

    :param wordlist: list of words; byte strings are decoded as UTF-8 when
        possible, other items are left untouched.
    :return: list of strings where each number phrase is merged into a single
        space-joined item. On any failure the (possibly decoded) word list
        is returned instead, as a best-effort fallback.
    """
    try:
        # Decode item by item: one undecodable or already-unicode word must
        # not prevent the other byte strings from being decoded.
        decoded = []
        for word in wordlist:
            if isinstance(word, bytes):
                try:
                    word = word.decode('utf8')
                except UnicodeDecodeError:
                    pass
            decoded.append(word)
        wordlist = decoded

        tags = number.detect_numbers(wordlist)
        phrases = []
        current = ""
        for tag, word in zip(tags, wordlist):
            if tag == u'DB':
                # Flush the phrase in progress before starting a new one,
                # otherwise adjacent number phrases would lose the first.
                if current:
                    phrases.append(current)
                current = word
            elif tag == u'DI':
                # A continuation without an open phrase simply starts one,
                # which avoids producing an item with a leading space.
                current = current + " " + word if current else word
            else:
                if current:
                    phrases.append(current)
                    current = ""
                # Keep every non-number word, not only the one that
                # immediately follows a number phrase.
                phrases.append(word)
        if current:
            phrases.append(current)
        return phrases
    except Exception:
        # Deliberate best-effort: if tagging or grouping fails for any
        # reason, hand back the words ungrouped rather than raising.
        return wordlist
Beispiel #2
0
             u"لم أجد شيئا",
             u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
             u'من ثلاثمئة وخمسين بلدا ',
             u'من ثلاثمئة وخمسين بلدا ',
             u'من أربعمئة وخمسين بلدا ',
             u'السلام عليكم 2014',
            ]
    #~ arepr = arabrepr.ArabicRepr()
    for txt in TEXTS:
        word_list = araby.tokenize(txt)
        positions_phrases = number.detect_number_phrases_position(word_list)
        print "*******"
        print(txt).encode('utf8')
        print("positions", positions_phrases)
        nb_phrases = number.extract_number_phrases(txt)
        tag_list = number.detect_numbers(word_list)
        for tup in zip(tag_list, word_list):
            print repr(tup).decode('unicode-escape').encode('utf8')
        
        print "tashkeel", repr(number.pre_tashkeel_number(word_list)).decode('unicode-escape').encode('utf8')

        mynumber = myNumber()
        positions_phrases2 = mynumber.detect_positions(word_list)
        print "#########"
        print(txt).encode('utf8')
        print("positions", positions_phrases2)
        tag_list2 = mynumber.detect_chunks(word_list)
        for tup in zip(tag_list2, word_list):
            print repr(tup).decode('unicode-escape').encode('utf8')
        
        print "tashkeel", repr(mynumber.pretashkeel(word_list)).decode('unicode-escape').encode('utf8')