def __init__(self, script='wx', raw='data/all_normalization_predictions_wx', rebuild=False):
    """Create the converters and load the pickled normalization dictionary.

    Args:
        script: stored unchanged as ``self.script``.
        raw: stored unchanged as ``self.raw``; presumably the input that
            ``create_norm_pickle`` reads -- confirm against that method.
        rebuild: when true, ``create_norm_pickle`` is called even if the
            pickle file already exists.
    """
    self.script = script
    self.raw = raw
    self.rebuild = rebuild
    self.pickled = './data/normdict.p'

    # One converter per direction (utf2wx and wx2utf), both for Hindi.
    self.con = wxConvert(order='utf2wx', lang='hin')
    self.revcon = wxConvert(order='wx2utf', lang='hin')
    self.trn = transliterator(source='eng', target='hin')

    # (Re)build the pickle when it is missing or a rebuild was requested.
    if not os.path.exists(self.pickled) or self.rebuild:
        self.create_norm_pickle()

    # Use a context manager so the file handle is closed deterministically.
    # NOTE: pickle executes arbitrary code on load; the file must be trusted.
    with open(self.pickled, 'rb') as pickle_fp:
        self.normdict = pickle.load(pickle_fp)
def fit(self):
    """Load the ``hu_*`` model arrays and build the regexes and punctuation table.

    Everything is read from paths relative to the directory containing this
    module and stored on ``self``.  ``self.esc_ch`` must already be set,
    because it is interpolated into the ``non_alpha`` pattern.

    Raises:
        ValueError: if a non-blank line of ``punkt.map`` does not consist of
            exactly two whitespace-separated fields.
    """
    self.con = wxConvert(order='utf2wx', rmask=False)
    dist_dir = os.path.dirname(os.path.abspath(__file__))

    # load models
    # NOTE(review): ``_utils`` is put on sys.path before loading, presumably
    # so objects stored in the .npy files can be resolved -- confirm.
    # NOTE(review): if these files hold pickled objects, recent NumPy
    # releases need ``allow_pickle=True`` here -- confirm before upgrading.
    sys.path.append('%s/_utils' % dist_dir)
    self.coef_ = np.load('%s/models/hu_coef.npy' % dist_dir)[0]
    self.classes_ = np.load('%s/models/hu_classes.npy' % dist_dir)[0]
    self.vectorizer_ = np.load('%s/models/hu_sparse-vec.npy' % dist_dir)[0]
    self.intercept_init_ = np.load('%s/models/hu_intercept_init.npy' % dist_dir)
    self.intercept_trans_ = np.load('%s/models/hu_intercept_trans.npy' % dist_dir)
    self.intercept_final_ = np.load('%s/models/hu_intercept_final.npy' % dist_dir)

    # initialize character maps
    self.letters = set(string.ascii_letters)
    self.mask_roman = re.compile(r'([a-zA-Z]+)')
    # u'' instead of ur'': the ur prefix is a SyntaxError on Python 3, and
    # on Python 2 both spellings produce the same pattern here because the
    # only backslash sequences are \u escapes.
    self.rom_dev = re.compile(u'([a-zA-Z])([\u0900-\u097f])')
    self.dev_rom = re.compile(u'([\u0900-\u097f])([a-zA-Z])')
    self.non_alpha = re.compile(r"([^a-zA-Z%s]+)" % (self.esc_ch))

    # initialize punctuation map table
    self.punkt_tbl = dict()
    # Read bytes and decode explicitly: works the same on Python 2 and 3.
    with open('%s/mapping/punkt.map' % dist_dir, 'rb') as punkt_fp:
        for raw_line in punkt_fp:
            fields = raw_line.decode('utf-8').split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            s, t = fields
            # Quote characters are deliberately left out of the table.
            if s in ["'", '"']:
                continue
            self.punkt_tbl[ord(s)] = t
def processInput(ifp, ofp, args, parser):
    """Parse every blank-line-separated sentence of ``ifp`` and write results to ``ofp``.

    Args:
        ifp: readable file object; it is read completely and then closed.
        ofp: writable file object receiving one parse per sentence, each
            followed by a blank line.
        args: not used by this function.
        parser: object whose ``getParse(list_of_sentences, sflag=False)``
            result is checked for truthiness and indexed at 0.
    """
    convertor = wxConvert(order='wx2utf', lang='hin', format_='conll')

    chunks = ifp.read().split('\n\n')
    ifp.close()

    for chunk in chunks:
        # Ignore empty / whitespace-only chunks.
        if chunk.strip():
            conll_text = "\n".join(rawtoconll.toConll(chunk))
            utf_text = convertor.convert(conll_text)
            parsed = parser.getParse([utf_text], sflag=False)
            if not parsed:
                continue
            ofp.write("%s\n\n" % (parsed[0]))
def __init__(self, lang):
    """Load the ``e<x>_*`` model arrays for ``lang`` and set up the converter.

    Args:
        lang: language code passed to ``wxConvert``; its first character
            selects the model files (``models/e<first-char>_*.npy``).
    """
    self.lookup = dict()
    self.n, self.tab, self.space = 4, '~~', '^^'
    self.con = wxConvert(order='wx2utf', lang=lang)

    # Only the first character of the language code is used in file names.
    lang = lang[0]

    # os.path.dirname instead of rpartition('/'): the latter returns an
    # empty string where the path separator is not '/'.
    path = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): presumably needed so objects stored in the .npy files
    # can be resolved on load -- confirm.
    sys.path.append(path)

    self.vec = np.load('%s/models/e%s_vec.npy' % (path, lang))[0]
    self.coef_ = np.load('%s/models/e%s_coef.npy' % (path, lang))[0]
    self.classes_ = np.load('%s/models/e%s_classes.npy' % (path, lang))[0]
    self.intercept_init_ = np.load('%s/models/e%s_intercept_init.npy' % (path, lang))
    self.intercept_trans_ = np.load('%s/models/e%s_intercept_trans.npy' % (path, lang))
    self.intercept_final_ = np.load('%s/models/e%s_intercept_final.npy' % (path, lang))
def __init__(self, lang):
    """Set up the converter and load the ``e<x>_*`` model arrays for ``lang``.

    Args:
        lang: language code handed to ``wxConvert``; only its first
            character is used to pick the files under ``models/``.
    """
    self.lookup = dict()
    self.n, self.tab, self.space = 4, '~~', '^^'
    self.con = wxConvert(order='wx2utf', lang=lang)

    # Model file names carry just the first character of the code.
    lang = lang[0]

    # Directory of this module, computed with os.path.dirname so it does not
    # depend on '/' being the platform's path separator.
    path = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): presumably required for resolving objects stored in the
    # .npy files -- confirm.
    sys.path.append(path)

    self.vec = np.load('%s/models/e%s_vec.npy' % (path, lang))[0]
    self.coef_ = np.load('%s/models/e%s_coef.npy' % (path, lang))[0]
    self.classes_ = np.load('%s/models/e%s_classes.npy' % (path, lang))[0]
    self.intercept_init_ = np.load('%s/models/e%s_intercept_init.npy' % (path, lang))
    self.intercept_trans_ = np.load('%s/models/e%s_intercept_trans.npy' % (path, lang))
    self.intercept_final_ = np.load('%s/models/e%s_intercept_final.npy' % (path, lang))
def __init__(self):
    """Load the ``uh_sparse`` classifier/vectorizer and the punctuation map.

    The punctuation table maps the second field of every ``punkt.map`` line
    to its first field.  If the map file cannot be read, the error is
    written to stderr and the process exits with a non-zero status.
    """
    self.n = 4
    self.space = '^^'
    self.lookup = dict()
    self.con = wxConvert(order='wx2utf')

    # os.path.dirname rather than rpartition('/'), which yields '' where the
    # path separator is not '/'.
    path = os.path.dirname(os.path.abspath(__file__))
    self.clf = jl.load('%s/models/uh_sparse-clf' % path)
    self.vec = jl.load('%s/models/uh_sparse-vec' % path)

    # Code points 0x0600 .. 0x06fe; range() excludes its upper bound.
    # NOTE(review): 0x06ff itself is not included -- confirm this is intended.
    self.range_ = set(range(0x0600, 0x06ff))

    try:
        with codecs.open('%s/extras/punkt.map' % path, 'r', 'utf-8') as punkt_fp:
            # Split each line once instead of twice.
            rows = (line.split() for line in punkt_fp)
            self.punkt = {fields[1]: fields[0] for fields in rows}
    except IOError as e:
        # "except ... as" and sys.stderr.write are valid on Python 2.6+ and 3.
        sys.stderr.write('%s\n' % (e,))
        # A missing map is a failure, so do not report success (was exit 0).
        sys.exit(1)
def __init__(self):
    """Load the ``hu_sparse`` classifier/vectorizer and the punctuation map.

    The punctuation table maps the first field of every ``punkt.map`` line
    to its second field.  If the map file cannot be read, the error is
    written to stderr and the process exits with a non-zero status.
    """
    self.n = 4
    self.space = '^^'
    self.lookup = dict()
    # NUL character kept as ``esc_char``; its use is outside this block.
    self.esc_char = chr(0)
    self.con = wxConvert(order='utf2wx')

    # os.path.dirname rather than rpartition('/'), which yields '' where the
    # path separator is not '/'.
    path = os.path.dirname(os.path.abspath(__file__))
    self.clf = jl.load('%s/models/hu_sparse-clf' % path)
    self.vec = jl.load('%s/models/hu_sparse-vec' % path)

    try:
        with codecs.open('%s/extras/punkt.map' % path, 'r', 'utf-8') as punkt_fp:
            # Split each line once instead of twice.
            rows = (line.split() for line in punkt_fp)
            self.punkt = {fields[0]: fields[1] for fields in rows}
    except IOError as e:
        # "except ... as" and sys.stderr.write are valid on Python 2.6+ and 3.
        sys.stderr.write('%s\n' % (e,))
        # A missing map is a failure, so do not report success (was exit 0).
        sys.exit(1)
def fit(self):
    """Load the ``uh_*`` model arrays and build the regex, letter set and map tables.

    Everything is read from paths relative to the directory containing this
    module and stored on ``self``.

    Raises:
        ValueError: if a non-blank line of ``punkt.map`` or ``urdu_urdu.map``
            does not consist of exactly two whitespace-separated fields.
    """
    self.con = wxConvert(order='wx2utf', rmask=False)
    dist_dir = os.path.dirname(os.path.abspath(__file__))

    # load models
    # NOTE(review): ``_utils`` is put on sys.path before loading, presumably
    # so objects stored in the .npy files can be resolved -- confirm.
    sys.path.append('%s/_utils' % dist_dir)
    self.coef_ = np.load('%s/models/uh_coef.npy' % dist_dir)[0]
    self.classes_ = np.load('%s/models/uh_classes.npy' % dist_dir)[0]
    self.vectorizer_ = np.load('%s/models/uh_sparse-vec.npy' % dist_dir)[0]
    self.intercept_init_ = np.load('%s/models/uh_intercept_init.npy' % dist_dir)
    self.intercept_trans_ = np.load('%s/models/uh_intercept_trans.npy' % dist_dir)
    self.intercept_final_ = np.load('%s/models/uh_intercept_final.npy' % dist_dir)

    # compile regexes
    self.non_alpha = re.compile(
        u'([^\u0621-\u063a\u0641-\u064a\u0674-\u06d3]+)')

    # unichr exists only on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # initialize character maps
    self.ascii_letters = set(string.ascii_letters)
    # Half-open code point ranges; they cover the same characters that the
    # ``non_alpha`` pattern above excludes.
    letter_ranges = ((0x0621, 0x063b), (0x0641, 0x064b), (0x0674, 0x06d4))
    self.letter_set = set(
        to_char(code) for low, high in letter_ranges for code in range(low, high))

    def read_pairs(map_path):
        # Yield the whitespace-separated fields of every non-blank line.
        # Bytes are decoded explicitly so this works on Python 2 and 3.
        with open(map_path, 'rb') as map_fp:
            for raw_line in map_fp:
                fields = raw_line.decode('utf-8').split()
                if fields:
                    yield fields

    # initialize punctuation map table (keyed by the code point of the
    # second field, valued by the first field)
    self.punkt_tbl = dict()
    for src, trg in read_pairs('%s/mapping/punkt.map' % dist_dir):
        self.punkt_tbl[ord(trg)] = src

    # initialize urdu normalization table (keyed by the code point of the
    # first field, valued by the second field)
    self.canonical_eq = dict()
    for src, trg in read_pairs('%s/mapping/urdu_urdu.map' % dist_dir):
        self.canonical_eq[ord(src)] = trg
#!/usr/bin/env python
# Example: convert a short WX-notation Hindi string with wxConvert and print it.

from converter_indic import wxConvert

con = wxConvert(order='wx2utf', lang='hin')

hin = """rAmaH akRay"""

# Parenthesised so the line is valid both as a Python 2 print statement and
# as a Python 3 function call; the output is the same.
print(con.convert(hin))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Example: convert a Hindi sentence with wxConvert(order='utf2wx') and print it.

import os, sys

from converter_indic import wxConvert

# here default language is hindi and default format is text
con = wxConvert(order='utf2wx')

hin = """देश भर में अब तक हुई बारिश औसत से छह फीसदी कम है जबकि विभाग का दावा था कि इसमें ५ फीसदी से ज्यादा कमी नहीं होगी"""

# Parenthesised so the line is valid both as a Python 2 print statement and
# as a Python 3 function call; the output is the same.
print(con.convert(hin))