def test_sentence(self):
    """Check that full romanized sentences parse to the expected Bangla."""
    # (romanized input, expected Bangla output)
    samples = (
        ("ami banglay gan gai", "আমি বাংলায় গান গাই"),
        ("amader valObasa hoye gel ghas, kheye gel goru ar diye gelo ba^sh",
         "আমাদের ভালোবাসা হয়ে গেল ঘাস, খেয়ে গেল গরু আর দিয়ে গেল বাঁশ"),
    )
    for roman, bangla in samples:
        self.assertEqual(avro.parse(roman), utf(bangla))
def test_sentence(self):
    """Verify sentence-level parsing against known Bangla renderings."""
    # Short sentence
    short_result = avro.parse("ami banglay gan gai")
    short_expected = utf("আমি বাংলায় গান গাই")
    self.assertEqual(short_result, short_expected)

    # Longer sentence containing a comma and a chandrabindu (^)
    long_result = avro.parse(
        "amader valObasa hoye gel ghas, kheye gel goru ar diye gelo ba^sh")
    long_expected = utf(
        "আমাদের ভালোবাসা হয়ে গেল ঘাস, খেয়ে গেল গরু আর দিয়ে গেল বাঁশ")
    self.assertEqual(long_result, long_expected)
def test_non_ascii(self):
    """Check how the parser handles input containing non-ascii characters.

    Non-ascii characters handed to the parser are expected to come back
    unchanged, including when they are mixed with romanized text.

    """
    # (expected output, input); the last two entries are mixed strings
    cases = [
        ('ব', 'ব'),
        ('অভ্র', 'অভ্র'),
        ('বআবা গো', 'বaba gO'),
        ('আমি বাংলায় গান গাই', 'aমি বাংলায় gaন গাi'),
    ]
    for expected, given in cases:
        self.assertEqual(utf(expected), avro.parse(given))
def test_non_ascii(self):
    """Test parser response for non ascii characters

    Parser should return any non-ascii characters passed to it

    """
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(utf('ব'), avro.parse('ব'))
    self.assertEqual(utf('অভ্র'), avro.parse('অভ্র'))
    # mixed string
    self.assertEqual(utf('বআবা গো'), avro.parse('বaba gO'))
    self.assertEqual(utf('আমি বাংলায় গান গাই'),
                     avro.parse('aমি বাংলায় gaন গাi'))
def test_patterns_without_rules_not_from_config(self):
    """Tests all patterns not from config that don't have rules

    This test is done in addition to
    test_patterns_without_rules_from_config() to ensure that text passed
    manually to avro.parse is properly parsed when it does not exactly
    match a pattern that has no rules specified.

    """
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    # Test some conjunctions
    self.assertEqual(utf("ভ্ল"), avro.parse("bhl"))
    self.assertEqual(utf("ব্ধ"), avro.parse("bdh"))
    self.assertEqual(utf("ড্ড"), avro.parse("DD"))
    # stunned stork!
    self.assertEqual(utf("স্তব্ধ বক"), avro.parse("stbdh bk"))
def test_basic(self):
    """Check avro.parse against a large table of pattern/output pairs.

    Each table entry is ``(romanized input, expected Bangla output)``.
    Entries are checked in order and the test stops at the first mismatch.
    Two empty-output checks are kept as explicit assertions between the
    tables so that they run at the same point in the sequence.

    """
    def check(cases):
        for roman, bangla in cases:
            self.assertEqual(avro.parse(roman), utf(bangla))

    check((
        # b / v
        ("bhl", "ভ্ল"),
        ("bj", "ব্জ"),
        ("bd", "ব্দ"),
        ("bb", "ব্ব"),
        ("bl", "ব্ল"),
        ("bh", "ভ"),
        ("vl", "ভ্ল"),
        ("b", "ব"),
        ("v", "ভ"),
        # c
        ("cNG", "চ্ঞ"),
        ("cch", "চ্ছ"),
        ("cc", "চ্চ"),
        ("ch", "ছ"),
        ("c", "চ"),
        # d / D
        ("dhn", "ধ্ন"),
        ("dhm", "ধ্ম"),
        ("dgh", "দ্ঘ"),
        ("ddh", "দ্ধ"),
        ("dbh", "দ্ভ"),
        ("dv", "দ্ভ"),
        ("dm", "দ্ম"),
        ("DD", "ড্ড"),
        ("Dh", "ঢ"),
        ("dh", "ধ"),
        ("dg", "দ্গ"),
        ("dd", "দ্দ"),
        ("D", "ড"),
        ("d", "দ"),
        # full stops
        ("...", "..."),
        (".`", "."),
        ("..", "।।"),
        (".", "।"),
        # g / G
        ("ghn", "ঘ্ন"),
        ("Ghn", "ঘ্ন"),
        ("gdh", "গ্ধ"),
        ("gN", "গ্ণ"),
        ("GN", "গ্ণ"),
        ("gn", "গ্ন"),
        ("gm", "গ্ম"),
        ("Gm", "গ্ম"),
        ("gl", "গ্ল"),
        ("Gl", "গ্ল"),
        ("gg", "জ্ঞ"),
        ("GG", "জ্ঞ"),
        ("Gg", "জ্ঞ"),
        ("gG", "জ্ঞ"),
        ("gh", "ঘ"),
        ("Gh", "ঘ"),
        ("g", "গ"),
        # h
        ("hN", "হ্ণ"),
        ("hn", "হ্ন"),
        ("hm", "হ্ম"),
        ("hl", "হ্ল"),
        ("h", "হ"),
        # j / J
        ("jjh", "জ্ঝ"),
        ("jNG", "জ্ঞ"),
        ("jh", "ঝ"),
        ("jj", "জ্জ"),
        ("j", "জ"),
        ("J", "জ"),
        # k
        ("kkhN", "ক্ষ্ণ"),
        ("kShN", "ক্ষ্ণ"),
        ("kkhm", "ক্ষ্ম"),
        ("kShm", "ক্ষ্ম"),
        ("kxN", "ক্ষ্ণ"),
        ("kxm", "ক্ষ্ম"),
        ("kkh", "ক্ষ"),
        ("kSh", "ক্ষ"),
        ("ksh", "কশ"),
        ("kx", "ক্ষ"),
        ("kk", "ক্ক"),
        ("kT", "ক্ট"),
        ("kt", "ক্ত"),
        ("kl", "ক্ল"),
        ("ks", "ক্স"),
        ("kh", "খ"),
        ("k", "ক"),
        # l
        ("lbh", "ল্ভ"),
        ("ldh", "ল্ধ"),
        ("lkh", "লখ"),
        ("lgh", "লঘ"),
        ("lph", "লফ"),
        ("lk", "ল্ক"),
        ("lg", "ল্গ"),
        ("lT", "ল্ট"),
        ("lD", "ল্ড"),
        ("lp", "ল্প"),
        ("lv", "ল্ভ"),
        ("lm", "ল্ম"),
        ("ll", "ল্ল"),
        ("lb", "ল্ব"),
        ("l", "ল"),
        # m
        ("mth", "ম্থ"),
        ("mph", "ম্ফ"),
        ("mbh", "ম্ভ"),
        ("mpl", "মপ্ল"),
        ("mn", "ম্ন"),
        ("mp", "ম্প"),
        ("mv", "ম্ভ"),
        ("mm", "ম্ম"),
        ("ml", "ম্ল"),
        ("mb", "ম্ব"),
        ("mf", "ম্ফ"),
        ("m", "ম"),
        # digits
        ("0", "০"),
        ("1", "১"),
        ("2", "২"),
        ("3", "৩"),
        ("4", "৪"),
        ("5", "৫"),
        ("6", "৬"),
        ("7", "৭"),
        ("8", "৮"),
        ("9", "৯"),
        # n / N / Ng / NG
        ("NgkSh", "ঙ্ক্ষ"),
        ("Ngkkh", "ঙ্ক্ষ"),
        ("NGch", "ঞ্ছ"),
        ("Nggh", "ঙ্ঘ"),
        ("Ngkh", "ঙ্খ"),
        ("NGjh", "ঞ্ঝ"),
        ("ngOU", "ঙ্গৌ"),
        ("ngOI", "ঙ্গৈ"),
        ("Ngkx", "ঙ্ক্ষ"),
        ("NGc", "ঞ্চ"),
        ("nch", "ঞ্ছ"),
        ("njh", "ঞ্ঝ"),
        ("ngh", "ঙ্ঘ"),
        ("Ngk", "ঙ্ক"),
        ("Ngx", "ঙ্ষ"),
        ("Ngg", "ঙ্গ"),
        ("Ngm", "ঙ্ম"),
        ("NGj", "ঞ্জ"),
        ("ndh", "ন্ধ"),
        ("nTh", "ন্ঠ"),
        ("NTh", "ণ্ঠ"),
        ("nth", "ন্থ"),
        ("nkh", "ঙ্খ"),
        ("ngo", "ঙ্গ"),
        ("nga", "ঙ্গা"),
        ("ngi", "ঙ্গি"),
        ("ngI", "ঙ্গী"),
        ("ngu", "ঙ্গু"),
        ("ngU", "ঙ্গূ"),
        ("nge", "ঙ্গে"),
        ("ngO", "ঙ্গো"),
        ("NDh", "ণ্ঢ"),
        ("nsh", "নশ"),
        ("Ngr", "ঙর"),
        ("NGr", "ঞর"),
        ("ngr", "ংর"),
        ("nj", "ঞ্জ"),
        ("Ng", "ঙ"),
        ("NG", "ঞ"),
        ("nk", "ঙ্ক"),
        ("ng", "ং"),
        ("nn", "ন্ন"),
        ("NN", "ণ্ণ"),
        ("Nn", "ণ্ন"),
        ("nm", "ন্ম"),
        ("Nm", "ণ্ম"),
        ("nd", "ন্দ"),
        ("nT", "ন্ট"),
        ("NT", "ণ্ট"),
        ("nD", "ন্ড"),
        ("ND", "ণ্ড"),
        ("nt", "ন্ত"),
        ("ns", "ন্স"),
        ("nc", "ঞ্চ"),
        ("n", "ন"),
        ("N", "ণ"),
        # O / OI / OU
        ("OI`", "ৈ"),
        ("OU`", "ৌ"),
        ("O`", "ো"),
        ("OI", "ঐ"),
        ("kOI", "কৈ"),
        (" OI", " ঐ"),
        ("(OI", "(ঐ"),
        (".OI", "।ঐ"),
        ("OU", "ঔ"),
        ("kOU", "কৌ"),
        (" OU", " ঔ"),
        ("-OU", "-ঔ"),
        (",,OU", "্ঔ"),
        ("O", "ও"),
        ("pO", "পো"),
        (" O", " ও"),
        ("iO", "ইও"),
        ("`O", "ও"),
        # p / f
        ("phl", "ফ্ল"),
        ("pT", "প্ট"),
        ("pt", "প্ত"),
        ("pn", "প্ন"),
        ("pp", "প্প"),
        ("pl", "প্ল"),
        ("ps", "প্স"),
        ("ph", "ফ"),
        ("fl", "ফ্ল"),
        ("f", "ফ"),
        ("p", "প"),
        # r / R
        ("rri`", "ৃ"),
        ("rri", "ঋ"),
        ("krri", "কৃ"),
        ("Irri", "ঈঋ"),
        ("^rri", "ঁঋ"),
        (":rri", "ঃঋ"),
        ("rZ", "র‍্য"),
        ("krZ", "ক্র্য"),
        ("rrZ", "রর‍্য"),
        ("yrZ", "ইয়র‍্য"),
        ("wrZ", "ওর‍্য"),
        ("xrZ", "এক্সর‍্য"),
        ("irZ", "ইর‍্য"),
        ("-rZ", "-র‍্য"),
        ("rrrZ", "ররর‍্য"),
        ("ry", "র‍্য"),
        ("qry", "ক্র্য"),
        ("rry", "রর‍্য"),
        ("yry", "ইয়র‍্য"),
        ("wry", "ওর‍্য"),
        ("xry", "এক্সর‍্য"),
        ("0ry", "০র‍্য"),
        ("rrrry", "রররর‍্য"),
        ("Rry", "ড়্র্য"),
        ("rr", "রর"),
        ("arr", "আরর"),
        ("arrk", "আর্ক"),
        ("arra", "আররা"),
        ("arr", "আরর"),
        ("arr!", "আরর!"),
        ("krr", "ক্রর"),
        ("krra", "ক্ররা"),
        ("Rg", "ড়্গ"),
        ("Rh", "ঢ়"),
        ("R", "ড়"),
        ("r", "র"),
        ("or", "অর"),
        ("mr", "ম্র"),
        ("1r", "১র"),
        ("+r", "+র"),
        ("rr", "রর"),
        ("yr", "ইয়র"),
        ("wr", "ওর"),
        ("xr", "এক্সর"),
        ("zr", "য্র"),
        ("mri", "ম্রি"),
        # s / S / sh / Sh
        ("shch", "শ্ছ"),
        ("ShTh", "ষ্ঠ"),
        ("Shph", "ষ্ফ"),
        ("Sch", "শ্ছ"),
        ("skl", "স্ক্ল"),
        ("skh", "স্খ"),
        ("sth", "স্থ"),
        ("sph", "স্ফ"),
        ("shc", "শ্চ"),
        ("sht", "শ্ত"),
        ("shn", "শ্ন"),
        ("shm", "শ্ম"),
        ("shl", "শ্ল"),
        ("Shk", "ষ্ক"),
        ("ShT", "ষ্ট"),
        ("ShN", "ষ্ণ"),
        ("Shp", "ষ্প"),
        ("Shf", "ষ্ফ"),
        ("Shm", "ষ্ম"),
        ("spl", "স্প্ল"),
        ("sk", "স্ক"),
        ("Sc", "শ্চ"),
        ("sT", "স্ট"),
        ("st", "স্ত"),
        ("sn", "স্ন"),
        ("sp", "স্প"),
        ("sf", "স্ফ"),
        ("sm", "স্ম"),
        ("sl", "স্ল"),
        ("sh", "শ"),
        ("Sc", "শ্চ"),
        ("St", "শ্ত"),
        ("Sn", "শ্ন"),
        ("Sm", "শ্ম"),
        ("Sl", "শ্ল"),
        ("Sh", "ষ"),
        ("s", "স"),
        ("S", "শ"),
        # oo
        ("oo", "উ"),
        ("OO", "ওও"),
        ("oo`", "ু"),
        ("koo", "কু"),
        ("ooo", "উঅ"),
        ("!oo", "!উ"),
        ("!ooo", "!উঅ"),
        ("aoo", "আউ"),
        ("oop", "উপ"),
        ("ooo`", "উ"),
    ))

    # Empty-output check kept explicit and at its original position
    self.assertEqual("", avro.parse("o`"))

    check((
        # o
        ("oZ", "অ্য"),
        ("oY", "অয়"),
        ("o", "অ"),
        ("!o", "!অ"),
        ("^o", "ঁঅ"),
        ("*o", "*অ"),
        ("io", "ইও"),
        ("yo", "ইয়"),
        ("no", "ন"),
        # t / T
        ("tth", "ত্থ"),
        ("t``", "ৎ"),
        ("`t``", "ৎ"),
        ("t``t``", "ৎৎ"),
        ("t```", "ৎ"),
        ("TT", "ট্ট"),
        ("Tm", "ট্ম"),
        ("Th", "ঠ"),
        ("tn", "ত্ন"),
        ("tm", "ত্ম"),
        ("th", "থ"),
        ("tt", "ত্ত"),
        ("T", "ট"),
        ("t", "ত"),
        # a / A
        ("aZ", "অ্যা"),
        ("aaZ", "আঅ্যা"),
        ("AZ", "অ্যা"),
        ("a`", "া"),
        ("a``", "া"),
        ("ka`", "কা"),
        ("A`", "া"),
        ("a", "আ"),
        ("`a", "আ"),
        ("k`a", "কআ"),
        ("ia", "ইয়া"),
        ("aaaa`", "আআআা"),
        # i / I
        ("i`", "ি"),
        ("i", "ই"),
        ("`i", "ই"),
        ("hi", "হি"),
        ("ih", "ইহ"),
        ("i`h", "িহ"),
        ("I`", "ী"),
        ("I", "ঈ"),
        ("cI", "চী"),
        ("Ix", "ঈক্স"),
        ("II", "ঈঈ"),
        ("0I", "০ঈ"),
        ("oI", "অঈ"),
        # u / U
        ("u`", "ু"),
        ("u", "উ"),
        ("ku", "কু"),
        ("uk", "উক"),
        ("uu", "উউ"),
        ("iu", "ইউ"),
        ("&u", "&উ"),
        ("u&", "উ&"),
        ("U`", "ূ"),
        ("U", "ঊ"),
        ("yU", "ইয়ূ"),
        ("Uy", "ঊয়"),
        ("^U", "ঁঊ"),
        ("U^", "ঊঁ"),
        # ee / e
        ("EE", "ঈ"),
        ("ee", "ঈ"),
        ("Ee", "ঈ"),
        ("eE", "ঈ"),
        ("ee`", "ী"),
        ("kee", "কী"),
        ("eek", "ঈক"),
        ("0ee", "০ঈ"),
        ("ee8", "ঈ৮"),
        ("(ee)", "(ঈ)"),
        ("e`", "ে"),
        ("e", "এ"),
        ("ke", "কে"),
        ("we", "ওয়ে"),
        ("#e#", "#এ#"),
        ("`e`", "ে"),
        # z / Z / y / Y
        ("z", "য"),
        ("Z", "্য"),
        ("rZ", "র‍্য"),
        ("kZS", "ক্যশ"),
        ("y", "ইয়"),
        ("oy", "অয়"),
        ("ky", "ক্য"),
        ("ya", "ইয়া"),
        ("yaa", "ইয়াআ"),
        ("Y", "য়"),
        ("YY", "য়য়"),
        ("iY", "ইয়"),
        ("kY", "কয়"),
        # q / w / x
        ("q", "ক"),
        ("Q", "ক"),
        ("w", "ও"),
        ("wa", "ওয়া"),
        ("-wa-", "-ওয়া-"),
        ("woo", "ওয়ু"),
        ("wre", "ওরে"),
        ("kw", "ক্ব"),
        ("x", "এক্স"),
        ("ex", "এক্স"),
        ("bx", "বক্স"),
        # signs
        (":`", ":"),
        (":", "ঃ"),
        ("^`", "^"),
        ("^", "ঁ"),
        ("k^", "কঁ"),
        ("k^i", "কঁই"),
        ("ki^", "কিঁ"),
        (",,", "্"),
        (",,,", "্,"),
        (",,`,", "্,"),
        ("`,,", "্"),
        (",`,", ",,"),
        ("$", "৳"),
    ))

    # Empty-output check kept explicit and at its original position
    self.assertEqual("", avro.parse("`"))
    self.assertEqual(avro.parse("bdh"), utf("ব্ধ"))
def test_patterns_with_rules_svaravarna(self):
    """Test patterns - with rules - svaravarna"""
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    # Test some vowels
    self.assertEqual(utf("অ"), avro.parse("o"))
    self.assertEqual(utf("আ"), avro.parse("a"))
    self.assertEqual(utf("ই"), avro.parse("i"))
    self.assertEqual(utf("ঈ"), avro.parse("I"))
    self.assertEqual(utf("উ"), avro.parse("u"))
    self.assertEqual(utf("উ"), avro.parse("oo"))
    self.assertEqual(utf("ঊ"), avro.parse("U"))
    self.assertEqual(utf("এ"), avro.parse("e"))
    self.assertEqual(utf("ঐ"), avro.parse("OI"))
    self.assertEqual(utf("ও"), avro.parse("O"))
    self.assertEqual(utf("ঔ"), avro.parse("OU"))
def test_patterns_punctuations(self):
    """Tests patterns - punctuations"""
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    # Test some punctuations
    self.assertEqual(utf("।"), avro.parse("."))
    self.assertEqual(utf("।।"), avro.parse(".."))
    self.assertEqual(utf("..."), avro.parse("..."))
def test_patterns_numbers(self):
    """Test patterns - numbers"""
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    # Test some numbers
    self.assertEqual(utf("০"), avro.parse("0"))
    self.assertEqual(utf("১"), avro.parse("1"))
    self.assertEqual(utf("২"), avro.parse("2"))
    self.assertEqual(utf("৩"), avro.parse("3"))
    self.assertEqual(utf("৪"), avro.parse("4"))
    self.assertEqual(utf("৫"), avro.parse("5"))
    self.assertEqual(utf("৬"), avro.parse("6"))
    self.assertEqual(utf("৭"), avro.parse("7"))
    self.assertEqual(utf("৮"), avro.parse("8"))
    self.assertEqual(utf("৯"), avro.parse("9"))
    self.assertEqual(utf("১১২"), avro.parse("112"))
def test_words_with_punctuations(self):
    """Test parsing of words with punctuations"""
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(utf('আয়রে,'), avro.parse('ayre,'))
    self.assertEqual(utf('ভোলা'), avro.parse('bhOla'))
    self.assertEqual(utf('খেয়াল'), avro.parse('kheyal'))
    self.assertEqual(utf('খোলা'), avro.parse('khOla'))
def test_sentences(self):
    """Test parsing of sentences"""
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(utf('আমি বাংলায় গান গাই'),
                     avro.parse('ami banglay gan gai'))
def test_sentences(self):
    """Check that a complete romanized sentence parses to Bangla."""
    expected = utf('আমি বাংলায় গান গাই')
    actual = avro.parse('ami banglay gan gai')
    self.assertEqual(expected, actual)
def parse(text):
    """Parses input text, matches and replaces using avrodict

    The text is scanned from left to right. At every position that has
    not already been consumed by an earlier match, non-rule patterns are
    tried first and rule patterns second. When a rule pattern matches,
    ``process_rules`` picks the replacement; if it returns ``None`` the
    pattern's own default replacement is emitted instead. A character
    that matches no pattern is copied to the output unchanged.

    Usage: ::
      from pyavrophonetic import avro
      avro.parse("ami banglay gan gai")

    """
    # Sanitize text case to meet phonetic comparison standards
    fixed_text = validate.fix_string_case(utf(text))
    # Replacement fragments, joined once at the end
    output = []
    # Index of the first character not yet consumed by a match
    cur_end = 0

    for cur, char in enumerate(fixed_text):
        # Trap characters with unicode encoding errors. UnicodeError is
        # the common base of UnicodeEncodeError (raised by str.encode,
        # e.g. for lone surrogates) and UnicodeDecodeError (raised by the
        # implicit decode when encoding a byte string on Python 2), so
        # both failure modes are passed through instead of crashing.
        try:
            char.encode('utf-8')
        except UnicodeError:
            cur_end = cur + 1
            output.append(char)
            continue

        # Skip positions already processed/replaced by an earlier match
        if cur < cur_end:
            continue

        # Try looking in non rule patterns with current string portion
        match = match_non_rule_patterns(fixed_text, cur)
        if match["matched"]:
            output.append(match["replaced"])
            cur_end = cur + len(match["found"])
            continue

        # Non rule patterns have not matched, try rule patterns
        match = match_rule_patterns(fixed_text, cur)
        if match["matched"]:
            # Update cur_end as cursor + length of match found
            cur_end = cur + len(match["found"])
            # Process its rules
            replaced = process_rules(rules=match["rules"],
                                     fixed_text=fixed_text,
                                     cur=cur, cur_end=cur_end)
            # If any rule matched, output the replacement from the rule,
            # else output the pattern's top-level/default replacement
            if replaced is not None:
                output.append(replaced)
            else:
                output.append(match["replaced"])
            continue

        # Nothing matched: pass the present character through
        cur_end = cur + 1
        output.append(char)

    # End looping through input text and produce output
    return ''.join(output)
def test_words_with_punctuations(self):
    """Check parsing of individual words, one with trailing punctuation."""
    # (expected Bangla output, romanized input)
    word_cases = (
        ('আয়রে,', 'ayre,'),
        ('ভোলা', 'bhOla'),
        ('খেয়াল', 'kheyal'),
        ('খোলা', 'khOla'),
    )
    for bangla, roman in word_cases:
        self.assertEqual(utf(bangla), avro.parse(roman))
def test_other(self):
    """Check that an exclamation mark comes back from the parser as-is."""
    parsed = avro.parse("!")
    expected = utf("!")
    self.assertEqual(parsed, expected)
def test_patterns_numbers(self):
    """Check that ASCII digits are converted to Bangla numerals."""
    # (expected Bangla numerals, ASCII input); last entry is multi-digit
    number_cases = (
        ("০", "0"),
        ("১", "1"),
        ("২", "2"),
        ("৩", "3"),
        ("৪", "4"),
        ("৫", "5"),
        ("৬", "6"),
        ("৭", "7"),
        ("৮", "8"),
        ("৯", "9"),
        ("১১২", "112"),
    )
    for bangla, ascii_digits in number_cases:
        self.assertEqual(utf(bangla), avro.parse(ascii_digits))
def test_patterns_punctuations(self):
    """Check parsing of one, two and three consecutive full stops."""
    # (expected output, input)
    dot_cases = (
        ("।", "."),
        ("।।", ".."),
        ("...", "..."),
    )
    for expected, given in dot_cases:
        self.assertEqual(utf(expected), avro.parse(given))
def test_patterns_with_rules_svaravarna(self):
    """Check rule-based patterns for the svaravarna (vowel) letters."""
    # (expected Bangla vowel, romanized input)
    vowel_cases = (
        ("অ", "o"),
        ("আ", "a"),
        ("ই", "i"),
        ("ঈ", "I"),
        ("উ", "u"),
        ("উ", "oo"),
        ("ঊ", "U"),
        ("এ", "e"),
        ("ঐ", "OI"),
        ("ও", "O"),
        ("ঔ", "OU"),
    )
    for bangla, roman in vowel_cases:
        self.assertEqual(utf(bangla), avro.parse(roman))