def predict():
    if request.method == 'POST':
        print(request.form.get('NewYork'))
        try:
            NewYork = request.form['NewYork']
            California = request.form['California']
            Florida = request.form['Florida']
            NewYorkstrip = NewYork.strip()
            NewYorkLower = NewYorkstrip.lower()
            f = open("data/Alldata.csv", "r")
            s = f.read()
            dic = ast.literal_eval(s)
            my_list = []
            words = NewYorkLower.split()
            for c in words:
                trn = Transliterator(source=California.strip(), target=Florida.strip(), build_lookup=True)
                eng = trn.transform(c.lower())
                my_list.append(dic.get(c, eng))
            a = my_list
            listToStr = ' '.join(map(str, a))
        except ValueError:
            return "Please check if the values are entered correctly"
        return render_template('home.html', prediction=listToStr)
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    trn = Transliterator(source='eng', target='hin', build_lookup=True, decode='beamsearch')
    best_transliterated_list = trn.transform(word, k_best=5)
    return {"transliteration": best_transliterated_list}
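# A minimal, hypothetical wiring for a view like the one above; it assumes Flask
# (suggested by the request.args.get pattern) and that indic-trans is installed.
# The route path and app object are illustrative, not taken from the source.
from flask import Flask, request
from indictrans import Transliterator

app = Flask(__name__)
trn = Transliterator(source='eng', target='hin', build_lookup=True, decode='beamsearch')


@app.route('/transliterate')
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    # Flask (>= 1.1) serializes the returned dict to JSON
    return {"transliteration": trn.transform(word, k_best=5)}


if __name__ == '__main__':
    app.run(debug=True)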
def read_kumaretal_2019_agg_downloads(path, mode, romanize=False):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed, n_romanized = 0, 0
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    lines = read_csv_file(path, has_header=False)
    for i, line in enumerate(lines):
        uid, txt, label = line[0], line[1], line[2]
        if not txt:
            continue
        if romanize:
            new_txt = trn_hin2eng.transform(txt)
            if txt != new_txt:
                n_romanized += 1
                txt = new_txt
        new_txt = clean_generic(txt)
        if new_txt.strip() == "":
            new_txt = txt
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="kumaretal_2019_agg",
                          task="classification",
                          split_type=mode,
                          uid=uid,
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    if romanize:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed} "
              f"and # of romanized instances: {n_romanized}")
    else:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def test_ind2ru(self):
    """Test Indic-to-[Roman, Urdu] ML models"""
    for lang_pair in self.src2trg:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, src, trg),
                     encoding='utf-8') as fp:
            for line in fp:
                word, expected = line.split()
                self.assertEqual(trans.transform(word), expected)
def test_ru2ind(self):
    """Test [Roman, Urdu]-to-Indic ML models"""
    for lang_pair in self.trg2src:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, trg, src),
                     encoding='utf-8') as fp:
            for line in fp:
                expected, word = line.split()
                self.assertEqual(trans.transform(word), expected)
def __init__(self, text, source, target):
    self.text = text
    self.source = source
    self.target = target
    self.isEngSource = (self.source == 'eng')
    if self.isEngSource:
        self.usdictionary = enchant.Dict("en_US")
        self.gbdictionary = enchant.Dict("en_GB")
    self.validated = self.source in self.codes and self.target in self.codes
    if self.validated:
        self.engine = Transliterator(source=self.source, target=self.target)
def get_converters():
    from indictrans import Transliterator
    converters = {
        'hi2ur': Transliterator(source='hin', target='urd', rb=False),  # , build_lookup=True),
        'ur2hi': Transliterator(source='urd', target='hin', rb=False),  # , build_lookup=True),
        'hi2en': Transliterator(source='hin', target='eng', rb=False),  # , build_lookup=True),
        'ur2en': Transliterator(source='urd', target='eng', rb=False),  # , build_lookup=True),
    }
    return converters
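# A hypothetical usage sketch for get_converters() above; the dictionary keys come
# from the snippet, while the sample Hindi string and prints are illustrative only.
converters = get_converters()
sample = "नमस्ते दुनिया"
print(converters['hi2ur'].transform(sample))  # Devanagari -> Urdu script
print(converters['hi2en'].transform(sample))  # Devanagari -> roman script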
def test_kbest(self):
    """Make sure `k-best` works without failure"""
    k_best = range(2, 15)
    r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
    i2r = Transliterator(source='hin', target='eng', decode='beamsearch')
    for k in k_best:
        hin = r2i.transform('indictrans', k_best=k)
        eng = i2r.transform(hin[0], k_best=k)
        self.assertTrue(len(hin) == k)
        self.assertTrue(len(eng) == k)
def get_converters():
    from google_trans_new import google_translator
    from indictrans import Transliterator
    converters = {
        'g_translator': google_translator(url_suffix="com.pk"),
        'hi2ur': Transliterator(source='hin', target='urd', rb=False),  # , build_lookup=True),
        'ur2hi': Transliterator(source='urd', target='hin', rb=False),  # , build_lookup=True),
        'hi2en': Transliterator(source='hin', target='eng', rb=False),  # , build_lookup=True),
        'ur2en': Transliterator(source='urd', target='eng', rb=False),  # , build_lookup=True),
    }
    return converters
class IndictransTransliterator:

    def __init__(self):
        self.trn = Transliterator(source='hin', target='eng',
                                  decode='beamsearch', build_lookup=True)
        self.trans_dict = {}

    def transliterate(self, original):
        transliterations = self.get_all(original)
        return random.choice(transliterations)

    def get_all(self, original):
        if original in self.trans_dict:
            return self.trans_dict[original]
        else:
            transliterations = self.trn.transform(original, k_best=5)
            self.trans_dict[original] = transliterations
            return transliterations

    @staticmethod
    def _is_deva(unicode_tok):
        """Returns True if |unicode_tok| contains a Devanagari character"""
        for c in unicode_tok:
            if int('0900', 16) <= ord(c) <= int('097f', 16):
                return True
        return False
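# A hypothetical usage sketch for the caching wrapper above; assumes the class and
# its imports (random, indictrans.Transliterator) are in scope. The sample word is
# illustrative only.
tr = IndictransTransliterator()
print(tr.get_all("नमस्ते"))         # up to 5 roman candidates, cached after the first call
print(tr.transliterate("नमस्ते"))   # one of the cached candidates, chosen at random
print(IndictransTransliterator._is_deva("नमस्ते"))  # True: contains Devanagari characters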
def read_iitp_product_reviews_hi_sa_downloads(path, mode):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed = 0
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    lines = [line.strip() for line in open(path, "r")]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue
        vals = line.split(",")
        label = vals[0]
        txt = ",".join(vals[1:])
        txt = trn_hin2eng.transform(txt)
        new_txt = "".join([char for char in txt])
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="iitp_product_reviews_hi_sa",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def perform_action(query):
    query = str(query).lower()
    display_map = {}
    kan_trans = Transliterator(source="kan", target="eng", build_lookup=True)
    hin_trans = Transliterator(source="hin", target="eng", build_lookup=True)
    query = transliterate(query, hin_trans, kan_trans)
    if get_lang_cs(query, lit) == "en_hi":
        intent = load_and_query_classifier(query)
    if get_lang_cs(query, lit) == "en_ka":
        intent = kannada_load_and_query_classifier(query)
    # print(intent)
    # print("INTENT identified:" + " " + intent)
    ner_type = ["HOTEL", "RESTAURANT", "TRAVEL_BOOKING", "REMINDER"]
    # flag = 1
    for kind in ner_type:
        if kind == intent:
            return ner_module.response(query, intent)
    if get_lang_cs(query, lit) == "en_hi":
        # print "This is a query of type:en_hi"
        word_map = preprocess_pipeline(query)
    elif get_lang_cs(query, lit) == "en_ka":
        # print "This is a query of type:en_ka"
        word_map = keyword_extract(query)
    if intent in ["SYMPTOMS", "TREATMENT", "PREVENTION"]:
        if query.find("corona") >= 0 or query.find("covid") >= 0 or query.find("crna") >= 0:
            if intent == "SYMPTOMS":
                return covid19.symptoms
            elif intent == "TREATMENT":
                return covid19.treatment
            elif intent == "PREVENTION":
                return covid19.prevention
    # print(word_map)
    display_map["code_switch_type"] = get_lang_cs(query, lit)
    display_map["transliterated"] = query
    display_map["intent"] = intent
    return "Response:" + "\n" + str(display_map) + "\n" + google_search(query, word_map, intent)
def read_hinglishpedia_downloads(path1, path2, mode, standardizing_tags={}):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    n_trimmed = 0
    FIELDS += [fieldname for fieldname in ["tgt", ] if fieldname not in FIELDS]
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    txt_lines = [line.strip() for line in open(path1, "r")]
    tag_lines = [line.strip() for line in open(path2, "r")]
    for i, (txt, tags) in tqdm(enumerate(zip(txt_lines, tag_lines))):
        if not txt:
            continue
        txt = trn_hin2eng.transform(txt)
        example = Example(dataset="hinglishpedia",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          langids=" ".join([
                              standardizing_tags[lid] if lid in standardizing_tags else "other"
                              for lid in tags.split()
                          ]),
                          text_pp=txt)
        examples.append(example)
        # progress_bar(len(examples), len(txt_lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def get_transliteration(vocab, headers):
    trans = {}
    if headers is None:
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        trans = {item: trn.transform(item) for item in vocab}
    else:
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
        count = 0
        body = []
        constructed_url = base_url + path
        query = ''
        while (count <= 6500):
            for i in range(count, (count + 500), 50):
                for j in range(i, i + 50):
                    query += vocab[j] + ' '
                body.append({'text': query.strip()})
                query = ''
            response = requests.post(constructed_url, headers=headers, json=body)
            result = response.json()
            for j, i in enumerate(result):
                trans.update({body[j]['text']: i['text']})
            body = []
            count += 500
        for i in range(count, len(vocab), 50):
            for j in range(i, i + 50):
                if j < len(vocab):
                    query += vocab[j] + ' '
            body.append({'text': query.strip()})
            query = ''
        response = requests.post(constructed_url, headers=headers, json=body)
        result = response.json()
        for j, i in enumerate(result):
            trans.update({body[j]['text']: i['text']})
    return trans
def __init__(self, lang='ta'):
    self.lang = lang
    self.normalizer = BaseNormalizer(lang)
    # This language map was created using Google's googletrans module. Create the file alltextlang.txt
    # by calling detect_lang_and_store in feature_utils.py
    self.lmap = self.load_language_maps(
        os.path.join(os.path.dirname(sys.path[0]), '../resources/data/alltextslang.txt'))
    self.soundexer = Soundex()
    self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
    self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
    self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    self.sym_spell.load_dictionary(
        '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
        term_index=0, count_index=1, separator="\t")
    super().__init__()
def test_rtrans(self):
    """Test Indic-to-Indic ML and Rule-Based models."""
    with io.open('%s/indic-test' % self.test_dir, encoding='utf-8') as fp:
        # first line contains language codes
        lang_codes = fp.readline().split()
        lang2word = dict(zip(lang_codes, [[] for i in range(len(lang_codes))]))
        for line in fp:
            line = line.split()
            for i, word in enumerate(line):
                lang2word[lang_codes[i]].append(word)
    for src in lang_codes:
        for trg in lang_codes:
            if src == trg:
                continue
            s2t_ml = Transliterator(source=src, target=trg, rb=False)
            s2t_rb = Transliterator(source=src, target=trg, rb=True)
            for word in lang2word[src]:
                s2t_ml.transform(word)
                s2t_rb.transform(word)
def start_lsh():
    create = True
    sqlite_file = "/home/hkesavam/new_lid/code/gen_data/db_lid"
    conn = sqlite3.connect(sqlite_file)
    cur = conn.cursor()
    data = cur.execute("select distinct(soundex) from dev_table where count > 5")
    data = list(data)
    # soundex_dict = pickle.load(open("/home/hkesavam/new_lid/code/gen_data/data_dev/dev_to_soundex.pkl", "r"))
    # data = soundex_dict.keys()
    # data = ["e16512", "e16532", "hello", "hell"]
    lsh = MinHashLSH(threshold=0.5, num_perm=32)
    # Create MinHash objects
    minhashes = {}
    tot_wr_count = 0
    if create:
        for c, i in enumerate(data):
            minhash = MinHash(num_perm=32)
            for d in i[0]:
                try:
                    d = d.encode("utf-8")
                    minhash.update(d)
                except:
                    print "Entering continue"
                    continue
            lsh.insert(c, minhash)
            minhashes[c] = minhash
        print len(data)
        print("Dumping")
        # cPickle.dump(new_data, open("data", "wb"), -1)
        cPickle.dump(lsh, open("/home/hkesavam/new_lid/code/gen_data/lsh_model", "wb"), -1)
        print "Finished dumping"
    if not create:
        data = cPickle.load(open("/home/hkesavam/new_lid/code/gen_data/data", "rb"))
        lsh = cPickle.load(open("/home/hkesavam/new_lid/code/gen_data/lsh_model", "rb"))
    hin_soundex_inst = Soundex()
    hin_trans = Transliterator(source='eng', target='hin', build_lookup=True)
    return cur, data, lsh, hin_soundex_inst, hin_trans
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import csv

from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)
# print eng
with open('dataSet.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        hin = row[1].decode('utf-8').replace('\n', " ")
        eng = trn.transform(hin)
        print row[0], eng.encode('unicode-escape'), row[2]
from indictrans import Transliterator

vocab = []
with open('all_roman.txt', 'r') as infile:
    con = infile.readlines()
    vocab = [x.strip('\n') for x in con]

trn = Transliterator(source='eng', target='hin')

with open('transliterations.txt', 'w+') as outfile:
    for word in vocab:
        deva = trn.transform(word)
        outfile.write(word + "\t" + deva + "\n")
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values.
    Keys are `review`, `emojis`, and `emoji-sentiment`.
    """

    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module. Create the file alltextlang.txt
        # by calling detect_lang_and_store in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]), '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0, count_index=1, separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                    # no match with dictionary, we need a more comprehensive dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
from sklearn.metrics import classification_report
from libindic.soundex import Soundex

from lib.feature_utils import load_docs, get_emojis_from_text, get_doc_len_range

sys.path.append(
    os.path.join(os.path.dirname(sys.path[0]), 'extern', 'indic_nlp_library'))
from indicnlp.normalize.indic_normalize import BaseNormalizer

try:
    from indictrans import Transliterator
except ImportError:
    print('Please install indic-trans from git: https://github.com/libindic/indic-trans')

ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each review.
MAX_SEQUENCE_LENGTH = 150
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)
soundexer = Soundex()


def load_language_maps(mapfile):
    lmap = {}
# -*- coding: utf-8 -*-
# @Author: claravania
# @Date:   2018-10-15 11:03:20
# @Last Modified by:   claravania
# @Last Modified time: 2018-11-08 17:22:37

import codecs
import os

from indictrans import Transliterator

ud_dir = '../data/ud-treebanks-v2.2'
hi_tb = 'UD_Urdu-UDTB'

trn = Transliterator(source='urd', target='eng')
filenames = os.listdir(os.path.join(ud_dir, hi_tb))

for f in filenames:
    if f.endswith('.conllu') or f.endswith('.conllu.sample'):
        fname = os.path.join(ud_dir, hi_tb, f)
        print 'Reading ' + fname
        ftrn = fname + '.en'
        fout = codecs.open(ftrn, 'w', encoding='utf-8')
        count = 0
        with codecs.open(fname, encoding='utf-8') as fin:
            for line in fin:
                count += 1
class Vida:
    """
    Only focused on English to 15 Indic Language Transliteration
        Hindi     (hin)
        Bengali   (ben)
        Gujarati  (guj)
        Punjabi   (pun)
        Malayalam (mal)
        Kannada   (kan)
        Tamil     (tam)
        Telugu    (tel)
        Oriya     (ori)
        Marathi   (mar)
        Assamese  (ass)
        Konkani   (kon)
        Bodo      (bod)
        Nepali    (nep)
        Urdu      (urd)
        English   (eng)
    """

    languages = {
        "hin": "Hindi",
        "ben": "Bengali",
        "guj": "Gujarati",
        "pun": "Punjabi",
        "mal": "Malayalam",
        "kan": "Kannada",
        "tam": "Tamil",
        "tel": "Telugu",
        "ori": "Oriya",
        "mar": "Marathi",
        "ass": "Assamese",
        "kon": "Konkani",
        "bod": "Bodo",
        "nep": "Nepali",
        "urd": "Urdu",
        "eng": "English"
    }
    codes = languages.keys()

    @staticmethod
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)

    def __init__(self, text, source, target):
        self.text = text
        self.source = source
        self.target = target
        self.isEngSource = (self.source == 'eng')
        if self.isEngSource:
            self.usdictionary = enchant.Dict("en_US")
            self.gbdictionary = enchant.Dict("en_GB")
        self.validated = self.source in self.codes and self.target in self.codes
        if self.validated:
            self.engine = Transliterator(source=self.source, target=self.target)

    def run(self):
        status = False
        message = "Couldn't transliterate the text."
        content = {}
        output = []
        if not self.validated:
            message = "Please provide languages and their code."
            output = self.text
        else:
            text = self.text.split()
            try:
                for index in xrange(len(text)):
                    word = text[index]
                    if not self.isEngSource:
                        word = word.decode('utf-8')
                        output.insert(index, self.engine.transform(word).encode('utf-8'))
                    else:
                        if not Vida.is_ascii(word):
                            word = word.decode('utf-8')
                        if not self.usdictionary.check(word) and not self.gbdictionary.check(word):
                            output.insert(index, self.engine.transform(word).encode('utf-8'))
                        else:
                            output.insert(index, word)
                status = True
                message = "Successfully transliterated the code."
            except UnicodeDecodeError, e:
                Repo.exception(e)
                message = "Couldn't decode the language properly."
            except IndexError, e:
                Repo.exception(e)
                message = "Couldn't properly frame the sentence."
            output = ' '.join(output)
import io
import re
import sys
import math
import string
import random
import pickle

from argparse import ArgumentParser
from collections import Counter, defaultdict

import dynet as dy
import numpy as np

from gensim.models.word2vec import KeyedVectors
from indictrans import Transliterator

trn = Transliterator(source='eng', target='hin', build_lookup=True)


def is_lang_dist(dist_string):
    return ":" in dist_string


def get_lang_dist(dist_string):
    dist = dict()
    pairs = dist_string.split(',')
    for p in pairs:
        lang, prob = p.split(':')
        dist[lang] = prob
    return dist
    # If transliterated word is not a legitimate hindi word, add the closest hindi match to the predictions list
    if hindi_match:
        suggested_words.append([hindi_match[0], 1])

    return suggested_words


if __name__ == '__main__':
    # sample test script to test this program logic

    # initialize english and hindi dictionary object
    eng_dict = enchant.Dict('en_US')
    hin_dict = enchant.Dict('hi_IN')

    # initialize objects for language classifier and indictrans class
    trn = Transliterator(source='eng', target='hin')
    classifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

    # take sample sentence from the user
    sentence = list(input().split())

    # transliterate every word in the sentence
    for word in sentence:
        print(transliterate_in_hindi(trn, word, eng_dict, hin_dict, classifier, 0))
def main():
    from indictrans import Transliterator
    trn = Transliterator(source='eng', target='tel', build_lookup=True)
    file = open("DATASET/datasetforfinal.txt", "r")
    file_data = file.readlines()
    print(len(file_data))
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from indictrans import Transliterator
from polyglot_tokenizer import Tokenizer

flag = True
s = 'hin'
t = 'eng'

forward_transl_full = Transliterator(source=s, target=t, build_lookup=True)
forward_transl_token = Transliterator(source=s, target=t, decode='beamsearch')
back_transl_token = Transliterator(source=t, target=s, build_lookup=True)

tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  # \nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"
l = l.lower().strip()
lines = l.split("\n")
print(lines)

output = []
if flag == True:
    for l in lines:
        json = {}
def code_transliterate(self):
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    eng = trn.transform(self)
    return eng
def __init__(self):
    self.trn = Transliterator(source='hin', target='eng',
                              decode='beamsearch', build_lookup=True)
    self.trans_dict = {}