def predict():
    if request.method == 'POST':
        print(request.form.get('NewYork'))
        try:
            NewYork = request.form['NewYork']
            California = request.form['California']
            Florida = request.form['Florida']
            NewYorkstrip = NewYork.strip()
            NewYorkLower = NewYorkstrip.lower()
            f = open("data/Alldata.csv", "r")
            s = f.read()
            dic = ast.literal_eval(s)
            my_list = []
            words = NewYorkLower.split()
            for c in words:
                trn = Transliterator(source=California.strip(), target=Florida.strip(), build_lookup=True)
                eng = trn.transform(c.lower())
                my_list.append(dic.get(c, eng))
            a = my_list
            listToStr = ' '.join(map(str, a))
        except ValueError:
            return "Please check if the values are entered correctly"
        return render_template('home.html', prediction=listToStr)
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    trn = Transliterator(source='eng', target='hin', build_lookup=True, decode='beamsearch')
    best_transliterated_list = trn.transform(word, k_best=5)
    return {"transliteration": best_transliterated_list}
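# A minimal, hypothetical wiring for a view like the one above; it assumes Flask
# (suggested by the request.args.get pattern) and that indic-trans is installed.
# The route path and app object are illustrative, not taken from the source.
from flask import Flask, request
from indictrans import Transliterator

app = Flask(__name__)
trn = Transliterator(source='eng', target='hin', build_lookup=True, decode='beamsearch')


@app.route('/transliterate')
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    # Flask (>= 1.1) serializes the returned dict to JSON
    return {"transliteration": trn.transform(word, k_best=5)}


if __name__ == '__main__':
    app.run(debug=True)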
def read_kumaretal_2019_agg_downloads(path, mode, romanize=False):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed, n_romanized = 0, 0
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    lines = read_csv_file(path, has_header=False)
    for i, line in enumerate(lines):
        uid, txt, label = line[0], line[1], line[2]
        if not txt:
            continue
        if romanize:
            new_txt = trn_hin2eng.transform(txt)
            if txt != new_txt:
                n_romanized += 1
                txt = new_txt
        new_txt = clean_generic(txt)
        if new_txt.strip() == "":
            new_txt = txt
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="kumaretal_2019_agg",
                          task="classification",
                          split_type=mode,
                          uid=uid,
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    if romanize:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed} "
              f"and # of romanized instances: {n_romanized}")
    else:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def test_ind2ru(self):
    """Test Indic-to-[Roman, Urdu] ML models"""
    for lang_pair in self.src2trg:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, src, trg),
                     encoding='utf-8') as fp:
            for line in fp:
                word, expected = line.split()
                self.assertEqual(trans.transform(word), expected)
def test_ru2ind(self):
    """Test [Roman, Urdu]-to-Indic ML models"""
    for lang_pair in self.trg2src:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, trg, src),
                     encoding='utf-8') as fp:
            for line in fp:
                expected, word = line.split()
                self.assertEqual(trans.transform(word), expected)
def __init__(self, text, source, target):
    self.text = text
    self.source = source
    self.target = target
    self.isEngSource = (self.source == 'eng')
    if self.isEngSource:
        self.usdictionary = enchant.Dict("en_US")
        self.gbdictionary = enchant.Dict("en_GB")
    self.validated = self.source in self.codes and self.target in self.codes
    if self.validated:
        self.engine = Transliterator(source=self.source, target=self.target)
def get_converters():
    from indictrans import Transliterator
    converters = {
        'hi2ur': Transliterator(source='hin', target='urd', rb=False),  # , build_lookup=True),
        'ur2hi': Transliterator(source='urd', target='hin', rb=False),  # , build_lookup=True),
        'hi2en': Transliterator(source='hin', target='eng', rb=False),  # , build_lookup=True),
        'ur2en': Transliterator(source='urd', target='eng', rb=False),  # , build_lookup=True),
    }
    return converters
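# A hypothetical usage sketch for get_converters() above; the dictionary keys come
# from the snippet, while the sample Hindi string and prints are illustrative only.
converters = get_converters()
sample = "नमस्ते दुनिया"
print(converters['hi2ur'].transform(sample))  # Devanagari -> Urdu script
print(converters['hi2en'].transform(sample))  # Devanagari -> roman script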
def test_kbest(self):
    """Make sure `k-best` works without failure"""
    k_best = range(2, 15)
    r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
    i2r = Transliterator(source='hin', target='eng', decode='beamsearch')
    for k in k_best:
        hin = r2i.transform('indictrans', k_best=k)
        eng = i2r.transform(hin[0], k_best=k)
        self.assertTrue(len(hin) == k)
        self.assertTrue(len(eng) == k)
def get_converters():
    from google_trans_new import google_translator
    from indictrans import Transliterator
    converters = {
        'g_translator': google_translator(url_suffix="com.pk"),
        'hi2ur': Transliterator(source='hin', target='urd', rb=False),  # , build_lookup=True),
        'ur2hi': Transliterator(source='urd', target='hin', rb=False),  # , build_lookup=True),
        'hi2en': Transliterator(source='hin', target='eng', rb=False),  # , build_lookup=True),
        'ur2en': Transliterator(source='urd', target='eng', rb=False),  # , build_lookup=True),
    }
    return converters
class IndictransTransliterator:

    def __init__(self):
        self.trn = Transliterator(source='hin', target='eng',
                                  decode='beamsearch', build_lookup=True)
        self.trans_dict = {}

    def transliterate(self, original):
        transliterations = self.get_all(original)
        return random.choice(transliterations)

    def get_all(self, original):
        if original in self.trans_dict:
            return self.trans_dict[original]
        else:
            transliterations = self.trn.transform(original, k_best=5)
            self.trans_dict[original] = transliterations
            return transliterations

    @staticmethod
    def _is_deva(unicode_tok):
        """Returns True if |unicode_tok| contains a Devanagari character"""
        for c in unicode_tok:
            if int('0900', 16) <= ord(c) <= int('097f', 16):
                return True
        return False
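# A hypothetical usage sketch for the caching wrapper above; assumes the class and
# its imports (random, indictrans.Transliterator) are in scope. The sample word is
# illustrative only.
tr = IndictransTransliterator()
print(tr.get_all("नमस्ते"))         # up to 5 roman candidates, cached after the first call
print(tr.transliterate("नमस्ते"))   # one of the cached candidates, chosen at random
print(IndictransTransliterator._is_deva("नमस्ते"))  # True: contains Devanagari characters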
def read_iitp_product_reviews_hi_sa_downloads(path, mode):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed = 0
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    lines = [line.strip() for line in open(path, "r")]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue
        vals = line.split(",")
        label = vals[0]
        txt = ",".join(vals[1:])
        txt = trn_hin2eng.transform(txt)
        new_txt = "".join([char for char in txt])
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="iitp_product_reviews_hi_sa",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def perform_action(query):
    query = str(query).lower()
    display_map = {}
    kan_trans = Transliterator(source="kan", target="eng", build_lookup=True)
    hin_trans = Transliterator(source="hin", target="eng", build_lookup=True)
    query = transliterate(query, hin_trans, kan_trans)
    if get_lang_cs(query, lit) == "en_hi":
        intent = load_and_query_classifier(query)
    if get_lang_cs(query, lit) == "en_ka":
        intent = kannada_load_and_query_classifier(query)
    # print(intent)
    # print("INTENT identified:" + " " + intent)
    ner_type = ["HOTEL", "RESTAURANT", "TRAVEL_BOOKING", "REMINDER"]
    # flag = 1
    for kind in ner_type:
        if kind == intent:
            return ner_module.response(query, intent)
    if get_lang_cs(query, lit) == "en_hi":
        # print "This is a query of type:en_hi"
        word_map = preprocess_pipeline(query)
    elif get_lang_cs(query, lit) == "en_ka":
        # print "This is a query of type:en_ka"
        word_map = keyword_extract(query)
    if intent in ["SYMPTOMS", "TREATMENT", "PREVENTION"]:
        if query.find("corona") >= 0 or query.find("covid") >= 0 or query.find("crna") >= 0:
            if intent == "SYMPTOMS":
                return covid19.symptoms
            elif intent == "TREATMENT":
                return covid19.treatment
            elif intent == "PREVENTION":
                return covid19.prevention
    # print(word_map)
    display_map["code_switch_type"] = get_lang_cs(query, lit)
    display_map["transliterated"] = query
    display_map["intent"] = intent
    return "Response:" + "\n" + str(display_map) + "\n" + google_search(query, word_map, intent)
def read_hinglishpedia_downloads(path1, path2, mode, standardizing_tags={}):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    n_trimmed = 0
    FIELDS += [fieldname for fieldname in ["tgt", ] if fieldname not in FIELDS]
    Example = namedtuple(f"{mode}_example", FIELDS, defaults=(None, ) * len(FIELDS))
    examples = []
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    txt_lines = [line.strip() for line in open(path1, "r")]
    tag_lines = [line.strip() for line in open(path2, "r")]
    for i, (txt, tags) in tqdm(enumerate(zip(txt_lines, tag_lines))):
        if not txt:
            continue
        txt = trn_hin2eng.transform(txt)
        example = Example(dataset="hinglishpedia",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          langids=" ".join([
                              standardizing_tags[lid] if lid in standardizing_tags else "other"
                              for lid in tags.split()
                          ]),
                          text_pp=txt)
        examples.append(example)
        # progress_bar(len(examples), len(txt_lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def get_transliteration(vocab, headers):
    trans = {}
    if headers is None:
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        trans = {item: trn.transform(item) for item in vocab}
    else:
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
        count = 0
        body = []
        constructed_url = base_url + path
        query = ''
        while (count <= 6500):
            for i in range(count, (count + 500), 50):
                for j in range(i, i + 50):
                    query += vocab[j] + ' '
                body.append({'text': query.strip()})
                query = ''
            response = requests.post(constructed_url, headers=headers, json=body)
            result = response.json()
            for j, i in enumerate(result):
                trans.update({body[j]['text']: i['text']})
            body = []
            count += 500
        for i in range(count, len(vocab), 50):
            for j in range(i, i + 50):
                if j < len(vocab):
                    query += vocab[j] + ' '
            body.append({'text': query.strip()})
            query = ''
        response = requests.post(constructed_url, headers=headers, json=body)
        result = response.json()
        for j, i in enumerate(result):
            trans.update({body[j]['text']: i['text']})
    return trans
def __init__(self, lang='ta'):
    self.lang = lang
    self.normalizer = BaseNormalizer(lang)
    # This language map was created using Google's googletrans module. Create the file alltextlang.txt
    # by calling detect_lang_and_store in feature_utils.py
    self.lmap = self.load_language_maps(
        os.path.join(os.path.dirname(sys.path[0]), '../resources/data/alltextslang.txt'))
    self.soundexer = Soundex()
    self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
    self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
    self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    self.sym_spell.load_dictionary(
        '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
        term_index=0, count_index=1, separator="\t")
    super().__init__()
def test_rtrans(self):
    """Test Indic-to-Indic ML and Rule-Based models."""
    with io.open('%s/indic-test' % self.test_dir, encoding='utf-8') as fp:
        # first line contains language codes
        lang_codes = fp.readline().split()
        lang2word = dict(zip(lang_codes, [[] for i in range(len(lang_codes))]))
        for line in fp:
            line = line.split()
            for i, word in enumerate(line):
                lang2word[lang_codes[i]].append(word)
    for src in lang_codes:
        for trg in lang_codes:
            if src == trg:
                continue
            s2t_ml = Transliterator(source=src, target=trg, rb=False)
            s2t_rb = Transliterator(source=src, target=trg, rb=True)
            for word in lang2word[src]:
                s2t_ml.transform(word)
                s2t_rb.transform(word)
def start_lsh():
    create = True
    sqlite_file = "/home/hkesavam/new_lid/code/gen_data/db_lid"
    conn = sqlite3.connect(sqlite_file)
    cur = conn.cursor()
    data = cur.execute("select distinct(soundex) from dev_table where count > 5")
    data = list(data)
    # soundex_dict = pickle.load(open("/home/hkesavam/new_lid/code/gen_data/data_dev/dev_to_soundex.pkl", "r"))
    # data = soundex_dict.keys()
    # data = ["e16512", "e16532", "hello", "hell"]
    lsh = MinHashLSH(threshold=0.5, num_perm=32)
    # Create MinHash objects
    minhashes = {}
    tot_wr_count = 0
    if create:
        for c, i in enumerate(data):
            minhash = MinHash(num_perm=32)
            for d in i[0]:
                try:
                    d = d.encode("utf-8")
                    minhash.update(d)
                except:
                    print "Entering continue"
                    continue
            lsh.insert(c, minhash)
            minhashes[c] = minhash
        print len(data)
        print("Dumping")
        # cPickle.dump(new_data, open("data", "wb"), -1)
        cPickle.dump(lsh, open("/home/hkesavam/new_lid/code/gen_data/lsh_model", "wb"), -1)
        print "Finished dumping"
    if not create:
        data = cPickle.load(open("/home/hkesavam/new_lid/code/gen_data/data", "rb"))
        lsh = cPickle.load(open("/home/hkesavam/new_lid/code/gen_data/lsh_model", "rb"))
    hin_soundex_inst = Soundex()
    hin_trans = Transliterator(source='eng', target='hin', build_lookup=True)
    return cur, data, lsh, hin_soundex_inst, hin_trans
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import csv

from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)
# print eng
with open('dataSet.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        hin = row[1].decode('utf-8').replace('\n', " ")
        eng = trn.transform(hin)
        print row[0], eng.encode('unicode-escape'), row[2]
from indictrans import Transliterator

vocab = []
with open('all_roman.txt', 'r') as infile:
    con = infile.readlines()
    vocab = [x.strip('\n') for x in con]

trn = Transliterator(source='eng', target='hin')

with open('transliterations.txt', 'w+') as outfile:
    for word in vocab:
        deva = trn.transform(word)
        outfile.write(word + "\t" + deva + "\n")
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values.
    Keys are `review`, `emojis`, and `emoji-sentiment`.
    """

    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module. Create the file alltextlang.txt
        # by calling detect_lang_and_store in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]), '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0, count_index=1, separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                    # no match with dictionary, we need a more comprehensive dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
from sklearn.metrics import classification_report
from libindic.soundex import Soundex

from lib.feature_utils import load_docs, get_emojis_from_text, get_doc_len_range

sys.path.append(
    os.path.join(os.path.dirname(sys.path[0]), 'extern', 'indic_nlp_library'))
from indicnlp.normalize.indic_normalize import BaseNormalizer

try:
    from indictrans import Transliterator
except ImportError:
    print('Please install indic-trans from git: https://github.com/libindic/indic-trans')

ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each review.
MAX_SEQUENCE_LENGTH = 150
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)
soundexer = Soundex()


def load_language_maps(mapfile):
    lmap = {}
# -*- coding: utf-8 -*-
# @Author: claravania
# @Date:   2018-10-15 11:03:20
# @Last Modified by:   claravania
# @Last Modified time: 2018-11-08 17:22:37

import codecs
import os

from indictrans import Transliterator

ud_dir = '../data/ud-treebanks-v2.2'
hi_tb = 'UD_Urdu-UDTB'

trn = Transliterator(source='urd', target='eng')
filenames = os.listdir(os.path.join(ud_dir, hi_tb))

for f in filenames:
    if f.endswith('.conllu') or f.endswith('.conllu.sample'):
        fname = os.path.join(ud_dir, hi_tb, f)
        print 'Reading ' + fname
        ftrn = fname + '.en'
        fout = codecs.open(ftrn, 'w', encoding='utf-8')
        count = 0
        with codecs.open(fname, encoding='utf-8') as fin:
            for line in fin:
                count += 1
class Vida:
    """
    Only focused on English to 15 Indic Language Transliteration
        Hindi     (hin)
        Bengali   (ben)
        Gujarati  (guj)
        Punjabi   (pun)
        Malayalam (mal)
        Kannada   (kan)
        Tamil     (tam)
        Telugu    (tel)
        Oriya     (ori)
        Marathi   (mar)
        Assamese  (ass)
        Konkani   (kon)
        Bodo      (bod)
        Nepali    (nep)
        Urdu      (urd)
        English   (eng)
    """

    languages = {
        "hin": "Hindi",
        "ben": "Bengali",
        "guj": "Gujarati",
        "pun": "Punjabi",
        "mal": "Malayalam",
        "kan": "Kannada",
        "tam": "Tamil",
        "tel": "Telugu",
        "ori": "Oriya",
        "mar": "Marathi",
        "ass": "Assamese",
        "kon": "Konkani",
        "bod": "Bodo",
        "nep": "Nepali",
        "urd": "Urdu",
        "eng": "English"
    }
    codes = languages.keys()

    @staticmethod
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)

    def __init__(self, text, source, target):
        self.text = text
        self.source = source
        self.target = target
        self.isEngSource = (self.source == 'eng')
        if self.isEngSource:
            self.usdictionary = enchant.Dict("en_US")
            self.gbdictionary = enchant.Dict("en_GB")
        self.validated = self.source in self.codes and self.target in self.codes
        if self.validated:
            self.engine = Transliterator(source=self.source, target=self.target)

    def run(self):
        status = False
        message = "Couldn't transliterate the text."
        content = {}
        output = []
        if not self.validated:
            message = "Please provide languages and their code."
            output = self.text
        else:
            text = self.text.split()
            try:
                for index in xrange(len(text)):
                    word = text[index]
                    if not self.isEngSource:
                        word = word.decode('utf-8')
                        output.insert(index, self.engine.transform(word).encode('utf-8'))
                    else:
                        if not Vida.is_ascii(word):
                            word = word.decode('utf-8')
                        if not self.usdictionary.check(word) and not self.gbdictionary.check(word):
                            output.insert(index, self.engine.transform(word).encode('utf-8'))
                        else:
                            output.insert(index, word)
                status = True
                message = "Successfully transliterated the code."
            except UnicodeDecodeError, e:
                Repo.exception(e)
                message = "Couldn't decode the language properly."
            except IndexError, e:
                Repo.exception(e)
                message = "Couldn't properly frame the sentence."
            output = ' '.join(output)
import io
import re
import sys
import math
import string
import random
import pickle

from argparse import ArgumentParser
from collections import Counter, defaultdict

import dynet as dy
import numpy as np

from gensim.models.word2vec import KeyedVectors
from indictrans import Transliterator

trn = Transliterator(source='eng', target='hin', build_lookup=True)


def is_lang_dist(dist_string):
    return ":" in dist_string


def get_lang_dist(dist_string):
    dist = dict()
    pairs = dist_string.split(',')
    for p in pairs:
        lang, prob = p.split(':')
        dist[lang] = prob
    return dist
    # If transliterated word is not a legitimate hindi word, add the closest hindi match to the predictions list
    if hindi_match:
        suggested_words.append([hindi_match[0], 1])

    return suggested_words


if __name__ == '__main__':
    # sample test script to test this program logic

    # initialize english and hindi dictionary object
    eng_dict = enchant.Dict('en_US')
    hin_dict = enchant.Dict('hi_IN')

    # initialize objects for language classifier and indictrans class
    trn = Transliterator(source='eng', target='hin')
    classifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

    # take sample sentence from the user
    sentence = list(input().split())

    # transliterate every word in the sentence
    for word in sentence:
        print(transliterate_in_hindi(trn, word, eng_dict, hin_dict, classifier, 0))
def main():
    from indictrans import Transliterator
    trn = Transliterator(source='eng', target='tel', build_lookup=True)
    file = open("DATASET/datasetforfinal.txt", "r")
    file_data = file.readlines()
    print(len(file_data))
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from indictrans import Transliterator
from polyglot_tokenizer import Tokenizer

flag = True
s = 'hin'
t = 'eng'

forward_transl_full = Transliterator(source=s, target=t, build_lookup=True)
forward_transl_token = Transliterator(source=s, target=t, decode='beamsearch')
back_transl_token = Transliterator(source=t, target=s, build_lookup=True)

tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  # \nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"
l = l.lower().strip()
lines = l.split("\n")
print(lines)

output = []
if flag == True:
    for l in lines:
        json = {}
def code_transliterate(self):
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    eng = trn.transform(self)
    return eng
def __init__(self):
    self.trn = Transliterator(source='hin', target='eng',
                              decode='beamsearch', build_lookup=True)
    self.trans_dict = {}