Example #1
0
 def __init__(self,
              script='wx',
              raw='data/all_normalization_predictions_wx',
              rebuild=False):
     """Create the converters and load the normalization dictionary.

     Args:
         script: stored on ``self.script``; not otherwise used here.
         raw: stored on ``self.raw``; not otherwise used here.
         rebuild: when true, ``create_norm_pickle()`` is called even if
             the pickle file already exists.
     """
     self.script = script
     self.raw = raw
     self.rebuild = rebuild
     self.pickled = './data/normdict.p'
     # one converter per direction: utf -> wx and wx -> utf
     self.con = wxConvert(order='utf2wx', lang='hin')
     self.revcon = wxConvert(order='wx2utf', lang='hin')
     self.trn = transliterator(source='eng', target='hin')
     # (re)build the pickled dictionary when it is missing or on request
     # NOTE(review): presumably create_norm_pickle() writes self.pickled;
     # confirm against its definition.
     if not os.path.exists(self.pickled) or self.rebuild:
         self.create_norm_pickle()
     # Use a context manager so the file handle is closed deterministically.
     # NOTE: pickle can execute arbitrary code on load; the file at
     # self.pickled must come from a trusted source.
     with open(self.pickled, 'rb') as pickle_fp:
         self.normdict = pickle.load(pickle_fp)
Example #2
0
    def fit(self):
        """Load the ``hu_*`` model arrays and build the lookup tables.

        Sets ``self.con``, the model attributes (``coef_``, ``classes_``,
        ``vectorizer_`` and the three ``intercept_*_`` arrays), the compiled
        regexes and ``self.punkt_tbl``.  Every file is resolved relative to
        the directory that contains this module.

        ``self.esc_ch`` is read while compiling ``non_alpha``, so it must be
        set before this method is called.
        """
        self.con = wxConvert(order='utf2wx', rmask=False)
        dist_dir = os.path.dirname(os.path.abspath(__file__))

        #load models
        # NOTE(review): _utils is put on sys.path before the loads below,
        # presumably so the stored objects can be restored -- confirm.
        sys.path.append('%s/_utils' % dist_dir)
        self.coef_ = np.load('%s/models/hu_coef.npy' % dist_dir)[0]
        self.classes_ = np.load('%s/models/hu_classes.npy' % dist_dir)[0]
        self.vectorizer_ = np.load('%s/models/hu_sparse-vec.npy' % dist_dir)[0]
        self.intercept_init_ = np.load(
            '%s/models/hu_intercept_init.npy' % dist_dir)
        self.intercept_trans_ = np.load(
            '%s/models/hu_intercept_trans.npy' % dist_dir)
        self.intercept_final_ = np.load(
            '%s/models/hu_intercept_final.npy' % dist_dir)

        #initialize character maps
        self.letters = set(string.ascii_letters)
        self.mask_roman = re.compile(r'([a-zA-Z]+)')
        # A Latin letter directly followed by a character in U+0900-U+097F,
        # and the reverse.  The u'' prefix (instead of the Python-2-only
        # ur'') produces the identical pattern, since the only escapes used
        # are \uXXXX, and keeps the module parseable on Python 3.
        self.rom_dev = re.compile(u'([a-zA-Z])([\u0900-\u097f])')
        self.dev_rom = re.compile(u'([\u0900-\u097f])([a-zA-Z])')
        # self.esc_ch is interpolated into the character class unescaped
        self.non_alpha = re.compile(r"([^a-zA-Z%s]+)" %(self.esc_ch))

        #initialize punctuation map table
        # Each line holds two whitespace-separated fields; the table maps the
        # code point of the first field to the second field.
        self.punkt_tbl = dict()
        with open('%s/mapping/punkt.map' % dist_dir) as punkt_fp:
            for line in punkt_fp:
                # lines are read as byte strings and decoded here
                line = line.decode('utf-8')
                s, t = line.split()
                # quote characters are deliberately left out of the table
                if s in ["'", '"']:
                    continue
                self.punkt_tbl[ord(s)] = t
def processInput(ifp, ofp, args, parser):
    """Parse every blank-line-separated sentence of ``ifp`` into ``ofp``.

    The whole input is read and ``ifp`` is closed.  Each non-blank chunk is
    passed through ``rawtoconll.toConll``, joined with newlines, converted
    with a ``wxConvert(order='wx2utf', lang='hin', format_='conll')``
    instance and handed to ``parser.getParse``.  When the parser returns a
    truthy result, its first element is written to ``ofp`` followed by a
    blank line.

    ``args`` is accepted but not used by this function.
    """
    wx_to_utf = wxConvert(order='wx2utf', lang='hin', format_='conll')
    raw_text = ifp.read()
    ifp.close()
    for chunk in raw_text.split('\n\n'):
        if chunk.strip():
            conll_text = "\n".join(rawtoconll.toConll(chunk))
            converted = wx_to_utf.convert(conll_text)
            parses = parser.getParse([converted], sflag=False)
            if parses:
                ofp.write("%s\n\n" % (parses[0]))
Example #4
0
    def __init__(self, lang):
        """Create the converter and load the ``e<x>_*`` model arrays.

        Args:
            lang: language code passed to ``wxConvert``; its first character
                selects the model files (``models/e<first char>_*.npy``)
                next to this module.
        """
        self.lookup = dict()
        self.n, self.tab, self.space = 4, '~~', '^^'
        self.con = wxConvert(order='wx2utf', lang=lang)
        # only the first character of the language code is used in file names
        lang = lang[0]
        # os.path.dirname instead of rpartition('/'): the latter yields ''
        # when the absolute path contains no '/', and '' on sys.path means
        # the current working directory.
        path = os.path.dirname(os.path.abspath(__file__))
        sys.path.append(path)
        self.vec = np.load('%s/models/e%s_vec.npy' % (path, lang))[0]
        self.coef_ = np.load('%s/models/e%s_coef.npy' % (path, lang))[0]
        self.classes_ = np.load('%s/models/e%s_classes.npy' % (path, lang))[0]
        self.intercept_init_ = np.load(
            '%s/models/e%s_intercept_init.npy' % (path, lang))
        self.intercept_trans_ = np.load(
            '%s/models/e%s_intercept_trans.npy' % (path, lang))
        self.intercept_final_ = np.load(
            '%s/models/e%s_intercept_final.npy' % (path, lang))
Example #5
0
 def __init__(self, lang):
     """Create the converter and load the ``e<x>_*`` model arrays.

     Args:
         lang: language code passed to ``wxConvert``; its first character
             selects the model files (``models/e<first char>_*.npy``)
             next to this module.
     """
     self.lookup = dict()
     self.n, self.tab, self.space = 4, '~~', '^^'
     self.con = wxConvert(order='wx2utf', lang=lang)
     # only the first character of the language code is used in file names
     lang = lang[0]
     # os.path.dirname instead of rpartition('/'): the latter yields ''
     # when the absolute path contains no '/', and '' on sys.path means
     # the current working directory.
     path = os.path.dirname(os.path.abspath(__file__))
     sys.path.append(path)
     self.vec = np.load('%s/models/e%s_vec.npy' % (path, lang))[0]
     self.coef_ = np.load('%s/models/e%s_coef.npy' % (path, lang))[0]
     self.classes_ = np.load('%s/models/e%s_classes.npy' % (path, lang))[0]
     self.intercept_init_ = np.load('%s/models/e%s_intercept_init.npy' %
                                    (path, lang))
     self.intercept_trans_ = np.load('%s/models/e%s_intercept_trans.npy' %
                                     (path, lang))
     self.intercept_final_ = np.load('%s/models/e%s_intercept_final.npy' %
                                     (path, lang))
Example #6
0
    def __init__(self):
        """Load the ``uh_sparse`` classifier/vectorizer and punctuation map.

        Sets ``n``, ``space``, ``lookup``, ``con``, ``clf``, ``vec``,
        ``range_`` and ``punkt``.  Files are resolved relative to the
        directory that contains this module.

        If ``extras/punkt.map`` cannot be read, the error is written to
        stderr and the process exits with status 1.
        """
        self.n = 4
        self.space = '^^'
        self.lookup = dict()
        self.con = wxConvert(order='wx2utf')
        # os.path.dirname instead of rpartition('/'), which yields '' when
        # the absolute path contains no '/'.
        path = os.path.dirname(os.path.abspath(__file__))
        self.clf = jl.load('%s/models/uh_sparse-clf' % path)
        self.vec = jl.load('%s/models/uh_sparse-vec' % path)
        # code points 0x0600 .. 0x06fe (the end of range() is exclusive)
        # NOTE(review): 0x06ff itself is not included -- confirm intent.
        self.range_ = set(range(0x0600, 0x06ff))

        punkt_path = '%s/extras/punkt.map' % path
        try:
            with codecs.open(punkt_path, 'r', 'utf-8') as punkt_fp:
                # maps the second field of each line to the first field;
                # every line is split only once
                punkt = {}
                for line in punkt_fp:
                    fields = line.split()
                    punkt[fields[1]] = fields[0]
                self.punkt = punkt
        except IOError as e:
            sys.stderr.write('%s\n' % (e,))
            # non-zero status: failing to load the map is an error
            sys.exit(1)
Example #7
0
    def __init__(self):
        """Load the ``hu_sparse`` classifier/vectorizer and punctuation map.

        Sets ``n``, ``space``, ``lookup``, ``esc_char``, ``con``, ``clf``,
        ``vec`` and ``punkt``.  Files are resolved relative to the directory
        that contains this module.

        If ``extras/punkt.map`` cannot be read, the error is written to
        stderr and the process exits with status 1.
        """
        self.n = 4
        self.space = '^^'
        self.lookup = dict()
        # the NUL character
        self.esc_char = chr(0)
        self.con = wxConvert(order='utf2wx')
        # os.path.dirname instead of rpartition('/'), which yields '' when
        # the absolute path contains no '/'.
        path = os.path.dirname(os.path.abspath(__file__))
        self.clf = jl.load('%s/models/hu_sparse-clf' % path)
        self.vec = jl.load('%s/models/hu_sparse-vec' % path)

        punkt_path = '%s/extras/punkt.map' % path
        try:
            with codecs.open(punkt_path, 'r', 'utf-8') as punkt_fp:
                # maps the first field of each line to the second field;
                # every line is split only once
                punkt = {}
                for line in punkt_fp:
                    fields = line.split()
                    punkt[fields[0]] = fields[1]
                self.punkt = punkt
        except IOError as e:
            sys.stderr.write('%s\n' % (e,))
            # non-zero status: failing to load the map is an error
            sys.exit(1)
Example #8
0
    def fit(self):
        """Load the ``uh_*`` model arrays and build the lookup tables.

        Sets ``self.con``, the model attributes (``coef_``, ``classes_``,
        ``vectorizer_`` and the three ``intercept_*_`` arrays), the
        ``non_alpha`` regex, the letter sets, ``punkt_tbl`` and
        ``canonical_eq``.  Every file is resolved relative to the directory
        that contains this module.
        """
        self.con = wxConvert(order='wx2utf', rmask=False)
        base_dir = os.path.dirname(os.path.abspath(__file__))

        # load models
        sys.path.append('%s/_utils' % base_dir)
        self.coef_ = np.load('%s/models/uh_coef.npy' % base_dir)[0]
        self.classes_ = np.load('%s/models/uh_classes.npy' % base_dir)[0]
        self.vectorizer_ = np.load(
            '%s/models/uh_sparse-vec.npy' % base_dir)[0]
        self.intercept_init_ = np.load(
            '%s/models/uh_intercept_init.npy' % base_dir)
        self.intercept_trans_ = np.load(
            '%s/models/uh_intercept_trans.npy' % base_dir)
        self.intercept_final_ = np.load(
            '%s/models/uh_intercept_final.npy' % base_dir)

        # compile regexes: one or more characters outside the three ranges
        pattern = u'([^\u0621-\u063a\u0641-\u064a\u0674-\u06d3]+)'
        self.non_alpha = re.compile(pattern)

        # initialize character maps
        self.ascii_letters = set(string.ascii_letters)
        # union of three code-point ranges; each upper bound is exclusive
        bounds = (("0x0621", "0x063b"),
                  ("0x0641", "0x064b"),
                  ("0x0674", "0x06d4"))
        letters = set()
        for low, high in bounds:
            letters |= set([unichr(i)
                            for i in range(int(low, 16), int(high, 16))])
        self.letter_set = letters

        # initialize punctuation map table: code point of the second field
        # of each line -> first field
        self.punkt_tbl = dict()
        with open('%s/mapping/punkt.map' % base_dir) as map_fp:
            for raw_line in map_fp:
                first, second = raw_line.decode('utf-8').split()
                self.punkt_tbl[ord(second)] = first

        # initialize urdu normalization table: code point of the first field
        # of each line -> second field
        self.canonical_eq = dict()
        with open('%s/mapping/urdu_urdu.map' % base_dir) as map_fp:
            for raw_line in map_fp:
                first, second = raw_line.decode('utf-8').split()
                self.canonical_eq[ord(first)] = second
#!/usr/bin/env python
from converter_indic import wxConvert
# Convert a short sample with a wx2utf converter for 'hin' and print it.
con = wxConvert(order='wx2utf', lang='hin')
hin = """rAmaH akRay"""
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python-2-only print statement.
print(con.convert(hin))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
from converter_indic import wxConvert
# Convert a sample sentence with a utf2wx converter and print the result.
con = wxConvert(order='utf2wx')  # here default language is hindi and default format is text
hin = """देश भर में अब तक हुई बारिश औसत से छह फीसदी कम है जबकि विभाग का दावा था कि इसमें ५ फीसदी से ज्यादा कमी नहीं होगी"""
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python-2-only print statement.
print(con.convert(hin))