def split_accourding_numbers(wordlist):
    """Group the words of each number phrase into a single string.

    The words are tagged with ``number.detect_numbers``. A word tagged
    ``u'DB'`` starts a new phrase, a word tagged ``u'DI'`` extends the
    phrase being built, and a word with any other tag is emitted on its
    own. The words of a phrase are joined with single spaces.

    Args:
        wordlist: sequence of words; UTF-8 byte strings are decoded to
            text first, items that are not byte strings are left as is.

    Returns:
        A list of strings in the original order, where every number phrase
        has been collapsed into one item. If anything goes wrong while
        tagging or grouping, the (possibly decoded) word list is returned
        unchanged, so the function never raises.
    """
    try:
        try:
            # Decode item by item: a single already-decoded word must not
            # prevent the byte strings around it from being decoded.
            wordlist = [
                word.decode('utf8') if isinstance(word, bytes) else word
                for word in wordlist
            ]
        except UnicodeDecodeError:
            # Best effort: bytes that are not valid UTF-8 are passed on raw.
            pass
        tags = number.detect_numbers(wordlist)
        chunks = []
        phrase = []  # words of the number phrase currently being built
        for tag, word in zip(tags, wordlist):
            if tag == u'DB':
                # Flush a phrase that is still pending; otherwise it would
                # be lost when two phrases directly follow each other.
                if phrase:
                    chunks.append(" ".join(phrase))
                phrase = [word]
            elif tag == u'DI':
                phrase.append(word)
            else:
                if phrase:
                    chunks.append(" ".join(phrase))
                    phrase = []
                chunks.append(word)
        if phrase:
            # The input ended while a phrase was still open.
            chunks.append(" ".join(phrase))
        return chunks
    except Exception:
        # Deliberate best-effort fallback: callers always get a word list.
        return wordlist
# Tail of a list of sample sentences; the opening of the list is outside
# this view (presumably the TEXTS list iterated below -- confirm).
u"لم أجد شيئا",
u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
u'من ثلاثمئة وخمسين بلدا ',
u'من ثلاثمئة وخمسين بلدا ',
u'من أربعمئة وخمسين بلدا ',
u'السلام عليكم 2014',
]
#~ arepr = arabrepr.ArabicRepr()
# Demo loop: for every sample text, print the results of the module-level
# functions of `number`, then the results of the `myNumber` methods.
# NOTE(review): the original indentation was lost; the nesting below is a
# reconstruction (everything is assumed to run once per text) -- confirm.
for txt in TEXTS:
    word_list = araby.tokenize(txt)
    positions_phrases = number.detect_number_phrases_position(word_list)
    print "*******"
    print(txt).encode('utf8')
    print("positions", positions_phrases)
    # nb_phrases is assigned but never used in the visible code.
    nb_phrases = number.extract_number_phrases(txt)
    tag_list = number.detect_numbers(word_list)
    for tup in zip(tag_list, word_list):
        # repr() escapes non-ASCII characters; decoding with
        # 'unicode-escape' turns them back into readable text, which is
        # then encoded as UTF-8 for output.
        print repr(tup).decode('unicode-escape').encode('utf8')
    print "tashkeel", repr(number.pre_tashkeel_number(word_list)).decode('unicode-escape').encode('utf8')
    # Same sequence of outputs, this time through a myNumber instance.
    mynumber = myNumber()
    positions_phrases2 = mynumber.detect_positions(word_list)
    print "#########"
    print(txt).encode('utf8')
    print("positions", positions_phrases2)
    tag_list2 = mynumber.detect_chunks(word_list)
    for tup in zip(tag_list2, word_list):
        print repr(tup).decode('unicode-escape').encode('utf8')
    print "tashkeel", repr(mynumber.pretashkeel(word_list)).decode('unicode-escape').encode('utf8')