def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indic-nlp-path", required=True,
                        help="path to Indic NLP Library root")
    parser.add_argument("--language", required=True)
    parser.add_argument("--remove-nuktas", default=False, action="store_true")
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    try:
        sys.path.extend([
            args.indic_nlp_path,
            os.path.join(args.indic_nlp_path, "src"),
        ])
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    except:
        raise Exception(
            "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct"
        )

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.language, remove_nuktas=args.remove_nuktas,
    )

    # normalize and tokenize
    for line in fileinput.input([args.input], openhook=fileinput.hook_compressed):
        line = normalizer.normalize(line.decode("utf-8", errors="ignore"))
        line = " ".join(indic_tokenize.trivial_tokenize(line, args.language))
        sys.stdout.write(line.encode("utf-8"))
def normalize(ip_file_path, op_file_path, ln):
    # Build the normalizer once instead of once per input line.
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(ln)
    with open(ip_file_path, 'r') as f:
        with open(op_file_path, "w") as text_file:
            for line in f:
                output_text = normalizer.normalize(line)
                text_file.write(output_text)
def extract_exclusive_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir):
    factory = IndicNormalizerFactory()
    l0_normalizer = factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer = factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache = defaultdict(lambda: [set(), set()])

    # read corpus 0
    en0_f = codecs.open(c0_dir + '/train.En', 'r', 'utf-8')
    l0_f = codecs.open(c0_dir + '/train.' + c0_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en0_f), iter(l0_f)):
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1
    en1_f = codecs.open(c1_dir + '/train.En', 'r', 'utf-8')
    l1_f = codecs.open(c1_dir + '/train.' + c1_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en1_f), iter(l1_f)):
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the exclusive data
    # from language en to c0
    xor_f = codecs.open(outdir + '/train.{}-{}'.format('En', c0_lang), 'w', 'utf-8')
    xor_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[0]) > 0 and len(other_l_lists[1]) == 0:
            other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[0]]
            xor_list.append(u''.join(en_l.split()) + u'|' +
                            u'^'.join(other_l_lists_w) + u'\n')

    random.shuffle(xor_list)
    for wr in xor_list:
        xor_f.write(wr)
    xor_f.close()

    # from language en to c1
    xor_f = codecs.open(outdir + '/train.{}-{}'.format('En', c1_lang), 'w', 'utf-8')
    xor_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[0]) == 0 and len(other_l_lists[1]) > 0:
            other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[1]]
            xor_list.append(u''.join(en_l.split()) + u'|' +
                            u'^'.join(other_l_lists_w) + u'\n')

    random.shuffle(xor_list)
    for wr in xor_list:
        xor_f.write(wr)
    xor_f.close()
def pre_process_hindi_sentence(line):
    # normalize (keeping nuktas), clean, then tokenize
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer("hi", remove_nuktas=False)
    line = normalizer.normalize(line)
    line = clean_text(line)
    tokens = indic_tokenize.trivial_tokenize(line)
    tokens = [word.lower() for word in tokens]
    # drop tokens containing digits
    tokens = [word for word in tokens if not re.search(r'\d', word)]
    return ' '.join(tokens)
def normalize_and_tokenize(self, lang, fname):
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(lang, remove_nuktas=False)
    tokenized_file = fname.replace('/', '_')
    tokenized_file = os.path.join('/tmp', tokenized_file)
    with open(fname) as istream:
        with open(tokenized_file, 'w+') as ostream:
            for line in istream:
                line = line.strip()
                line = normalizer.normalize(line)
                tokens = tokenize(line, lang=self.src_lang)
                tokenized_line = ' '.join(tokens)
                print(tokenized_line, file=ostream)
    return tokenized_file
{'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
```

3.

```
{'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
```

This example is taken from [here](https://leimao.github.io/blog/Byte-Pair-Encoding/).

**Create instances of normalizer and tokenizer for English and Marathi**
"""

from mosestokenizer import *
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("mr")
tokenize = MosesTokenizer('en')

"""**Preprocessing functions for**

> * English: Lowercase + Tokenize
> * Marathi: Normalize + Tokenize
"""

def preprocess_en(text):
    s = text.lower()
    s = ' '.join(tokenize(s))
    return s

def preprocess_mr(text):
    s = normalizer.normalize(text)
    s = ' '.join(indic_tokenize.trivial_tokenize(s, 'mr'))
    return s
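# The vocabulary snapshots above are produced by the standard BPE learning loop:
# count adjacent symbol pairs, merge the most frequent pair, repeat. The sketch
# below is adapted from the well-known reference implementation in Sennrich et
# al. (2016) and is an illustration only (it is not part of the original
# notebook); on the starting vocabulary of the example, its first two merges
# yield exactly the 'es' and 'est' snapshots shown above.

import re
import collections

def get_stats(vocab):
    """Count frequencies of adjacent symbol pairs in the vocabulary."""
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, v_in):
    """Rewrite the vocabulary with the given pair merged into one symbol."""
    v_out = {}
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word, freq in v_in.items():
        v_out[pattern.sub(''.join(pair), word)] = freq
    return v_out

vocab = {'l o w </w>': 5, 'l o w e r </w>': 2,
         'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
for _ in range(2):
    stats = get_stats(vocab)
    best = max(stats, key=stats.get)  # first merge: ('e', 's'); second: ('es', 't')
    vocab = merge_vocab(best, vocab)
print(vocab)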
def process(lang, sent):
    normalizer = IndicNormalizerFactory().get_normalizer(lang)
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
def __init__(self):
    data_dir = os.path.dirname(__file__) + '/data/'

    self.initial_urdu_to_hindi_map = {}
    self.final_urdu_to_hindi_map = {}
    self.urdu_to_hindi_map_pass1 = {}
    self.urdu_to_hindi_map_pass2 = {}

    for map_file in HINDUSTANI_MISC_MAP_FILES:
        df = pd.read_csv(data_dir + map_file, header=None)
        for i in df.columns:
            urdu_letter, roman_letter, hindi_letter = str(
                df[i][0]).strip(), str(df[i][1]).strip(), str(df[i][2]).strip()
            self.urdu_to_hindi_map_pass1[urdu_letter] = hindi_letter

    for map_file in INITIAL_HINDUSTANI_MAP_FILES:
        df = pd.read_csv(data_dir + map_file, header=None)
        for i in df.columns:
            urdu_letter, roman_letter, hindi_letter = str(
                df[i][0]).strip(), str(df[i][1]).strip(), str(df[i][2]).strip()
            self.initial_urdu_to_hindi_map[urdu_letter] = hindi_letter

    for map_file in FINAL_HINDUSTANI_MAP_FILES:
        df = pd.read_csv(data_dir + map_file, header=None)
        for i in df.columns:
            urdu_letter, roman_letter, hindi_letter = str(
                df[i][0]).strip(), str(df[i][1]).strip(), str(df[i][2]).strip()
            self.final_urdu_to_hindi_map[urdu_letter] = hindi_letter

    for map_file in HINDUSTANI_MAIN_MAP_FILES:
        df = pd.read_csv(data_dir + map_file, header=None)
        for i in df.columns:
            urdu_letter, roman_letter, hindi_letter = str(
                df[i][0]).strip(), str(df[i][1]).strip(), str(df[i][2]).strip()
            self.urdu_to_hindi_map_pass2[urdu_letter] = hindi_letter

            if 'consonants' not in map_file:
                continue

            # Non-initial forms: Consonant+ا to Consonant+ा
            self.urdu_to_hindi_map_pass2[urdu_letter + 'ا'] = hindi_letter + 'ा'

            if len(urdu_letter) == 1:
                urdu_shadda, hindi_shadda = urdu_letter + " ّ".strip(
                ), hindi_letter + '्' + hindi_letter
                self.urdu_to_hindi_map_pass1[urdu_shadda] = hindi_shadda
                self.urdu_to_hindi_map_pass1[urdu_shadda + 'ا'] = hindi_shadda + 'ा'
                # Note on why it's not in pass-2: پکّا is converted as पक्कअ instead of पक्का (Regex sees shadda char as word boundary?)

    self.initial_urdu_to_hindi_converter = StringTranslator(
        self.initial_urdu_to_hindi_map, match_initial_only=True)
    self.final_urdu_to_hindi_converter = StringTranslator(
        self.final_urdu_to_hindi_map, match_final_only=True)
    self.urdu_to_hindi_converter_pass1 = StringTranslator(
        self.urdu_to_hindi_map_pass1)
    self.urdu_to_hindi_converter_pass2 = StringTranslator(
        self.urdu_to_hindi_map_pass2)

    # Monkey patch: Force ह to map only to ہ (not ھ)
    self.urdu_to_hindi_converter_pass2.reverse_translation_dict['ह'] = 'ہ'
    self.urdu_to_hindi_converter_pass2.reverse_translation_dict['ह' + 'ा'] = 'ہ' + 'ا'
    self.urdu_to_hindi_converter_pass1.reverse_translation_dict['ह्ह'] = 'ہّ'
    self.urdu_to_hindi_converter_pass1.reverse_translation_dict['ह्ह' + 'ा'] = 'ہّ' + 'ا'

    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    self.hindi_normalizer = IndicNormalizerFactory().get_normalizer('hi')
def extract_common_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir):
    factory = IndicNormalizerFactory()
    l0_normalizer = factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer = factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache = defaultdict(lambda: [set(), set()])

    # read corpus 0
    en0_f = codecs.open(c0_dir + '/train.En', 'r', 'utf-8')
    l0_f = codecs.open(c0_dir + '/train.' + c0_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en0_f), iter(l0_f)):
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1
    en1_f = codecs.open(c1_dir + '/train.En', 'r', 'utf-8')
    l1_f = codecs.open(c1_dir + '/train.' + c1_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en1_f), iter(l1_f)):
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the common data
    # from language c0 to c1
    cc0_1_f = codecs.open(outdir + '/train.{}-{}'.format(c0_lang, c1_lang), 'w', 'utf-8')
    cc0_1_xlit_f = codecs.open(outdir + '/train.{}-{}.xlit'.format(c0_lang, c1_lang), 'w', 'utf-8')

    cc0_1_list = []
    cc0_1_xlit_list = []

    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[0]) > 0 and len(other_l_lists[1]) > 0:
            for c0_str in other_l_lists[0]:
                c0_str_w = c0_str.replace(u' ', u'')
                other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[1]]
                if len(c0_str_w) > 3:
                    cc0_1_list.append(c0_str_w + u'|' + u'^'.join(other_l_lists_w) + u'\n')
                    cc0_1_xlit_list.append(
                        UnicodeIndicTransliterator.transliterate(
                            c0_str_w, lang_code_mapping[c0_lang], 'hi') + u'|' +
                        u'^'.join([
                            UnicodeIndicTransliterator.transliterate(
                                x, lang_code_mapping[c1_lang], 'hi')
                            for x in other_l_lists_w
                        ]) + u'\n')

    combined_list = zip(cc0_1_list, cc0_1_xlit_list)
    random.shuffle(combined_list)
    for wr, wr_xlit in combined_list:
        cc0_1_f.write(wr)
        cc0_1_xlit_f.write(wr_xlit)

    cc0_1_f.close()
    cc0_1_xlit_f.close()

    # from language c1 to c0
    cc1_0_f = codecs.open(outdir + '/train.{}-{}'.format(c1_lang, c0_lang), 'w', 'utf-8')
    cc1_0_xlit_f = codecs.open(outdir + '/train.{}-{}.xlit'.format(c1_lang, c0_lang), 'w', 'utf-8')

    cc1_0_list = []
    cc1_0_xlit_list = []

    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[1]) > 0 and len(other_l_lists[0]) > 0:
            for c1_str in other_l_lists[1]:
                c1_str_w = c1_str.replace(u' ', u'')
                other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[0]]
                if len(c1_str_w) > 3:
                    cc1_0_list.append(c1_str_w + u'|' + u'^'.join(other_l_lists_w) + u'\n')
                    cc1_0_xlit_list.append(
                        UnicodeIndicTransliterator.transliterate(
                            c1_str_w, lang_code_mapping[c1_lang], 'hi') + u'|' +
                        u'^'.join([
                            UnicodeIndicTransliterator.transliterate(
                                x, lang_code_mapping[c0_lang], 'hi')
                            for x in other_l_lists_w
                        ]) + u'\n')

    combined_list = zip(cc1_0_list, cc1_0_xlit_list)
    random.shuffle(combined_list)
    for wr, wr_xlit in combined_list:
        cc1_0_f.write(wr)
        cc1_0_xlit_f.write(wr_xlit)

    cc1_0_f.close()
    cc1_0_xlit_f.close()
import pickle

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas=remove_nuktas)

# read the Hindi side of the parallel corpus as Unicode text
hin = open('./model/dataset/en-hi.hi', encoding='utf-8').readlines()
print(hin[:5])

hin = [normalizer.normalize(line.strip()) for line in hin]
hin = [indic_tokenize.trivial_tokenize(line) for line in hin]
print(hin[:5])

with open("hindi_tokens.txt", "wb") as fp:
    pickle.dump(hin, fp)
# In[ ]:

for path in sys.path:
    print(path)

# In[ ]:

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text = "\u0958 \u0915\u093c"
remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas)
output_text = normalizer.normalize(input_text)

print(output_text)
print('Length before normalization: {}'.format(len(input_text)))
print('Length after normalization: {}'.format(len(output_text)))

# In[ ]:

from indicnlp.normalize.indic_normalize import DevanagariNormalizer

input_text = "अत : इसे बिना टाँके वाला ऑपरेशन भी कहते हैं ।"
factory1 = DevanagariNormalizer()
#normalizer1=factory1.get_normalizer("hi",remove_nuktas)
output_text1 = factory1.normalize(input_text)
print(output_text1)
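# For reference (illustration only, not part of the original notebook): the two
# inputs above are the precomposed nukta letter क़ (U+0958) and the decomposed
# sequence क + nukta (U+0915 U+093C). The normalizer canonicalizes them to a
# single convention, which is why the codepoint counts printed above differ.
# The standard library makes the difference visible:

import unicodedata

for ch in "\u0958 \u0915\u093c":
    print('U+{:04X} {}'.format(ord(ch), unicodedata.name(ch)))
# U+0958 DEVANAGARI LETTER QA
# U+0020 SPACE
# U+0915 DEVANAGARI LETTER KA
# U+093C DEVANAGARI SIGN NUKTA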
import sentencepiece
from sacremoses import MosesDetokenizer, MosesTokenizer
import sys, os

sys.path.extend([
    "app/modules/indic_nlp_library/src",
])  # coming all the way from app.py

from indicnlp.tokenize import indic_tokenize, indic_detokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    "ne", remove_nuktas=False,
)


def bpencode(sentence, srctolang):
    sp = sentencepiece.SentencePieceProcessor()
    if srctolang == "ne_en":
        sp.Load("app/ne_en_bpe20000/sentencepiece.bpe.model")
    elif srctolang == "en_ne":
        sp.Load("app/en_ne_bpe5000/sentencepiece.bpe.model")
    return " ".join(sp.EncodeAsPieces(sentence))


def detok(sentence, lang):
    if lang == "en":
        return MosesDetokenizer(lang="en").detokenize(sentence.split())
    # assumption: non-English (Nepali) output is detokenized with the Indic NLP detokenizer
    return indic_detokenize.trivial_detokenize(sentence, lang)
def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            lang = "en"
            logger.info(
                f" - Moses sentence splitter for {lang}: falling back to {lang} rules"
            )
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            lang = "hi"
            logger.info(
                f" - Indic sentence splitter for {lang}: falling back to {lang} rules"
            )
        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "geez":
        logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")

    return None
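# The function above returns a plain callable from a raw line to an iterable of
# sentences, so calling code stays agnostic of the underlying splitter. A
# hypothetical usage sketch follows; the language code "hin_Deva" and the sample
# text are assumptions for illustration (the accepted codes depend on the
# LANGS_* tables defined alongside this function).

split = get_split_algo("hin_Deva", split_algo="default")
for sentence in split("यह पहला वाक्य है। यह दूसरा वाक्य है।"):
    print(sentence)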
# -*- coding: utf-8 -*-

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME="/Users/Avijit/Documents/nlp_lib"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/Users/Avijit/Documents/nlp_res"

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text=u"\u0958 \u0915\u093c"
remove_nuktas=False
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("hi",remove_nuktas)
output_text=normalizer.normalize(input_text)

print output_text
print 'Length before normalization: {}'.format(len(input_text))
print 'Length after normalization: {}'.format(len(output_text))
class XLingualTrainDataset_baseline_lstm(Dataset):
    '''
    Reverse dictionary data loader for training
    '''

    def __init__(self, dataset_path, index_path, cache_path):
        '''
        Init class method

        Arguments:
        dataset_path - path to json data
        index_paths - dict that maps language tag to faiss index path
        '''
        self.cache_path = cache_path
        self.lang_map = {
            'HI': 'hi',
            'BE': 'bn',
            'GU': 'gu',
            'OD': 'or',
            'PU': 'pa',
            'EN': 'en',
            'MA': 'mr'
        }
        self.dataset = read_json_file(dataset_path)
        self.index_path = index_path
        print(self.index_path, self.cache_path)
        self.factory = IndicNormalizerFactory()
        self.stemmer = WordNetLemmatizer()
        self.normalizers = self.get_indic_normalizers()
        self.en_stop = set(nltk.corpus.stopwords.words('english'))

        # Dataset params
        self.phrases = list()
        self.targets = list()
        self.src_lang = list()
        self.target_lang = list()
        self.max_seq_length = 128
        self.language_ids = {
            'HI': 0,
            'BE': 1,
            'GU': 2,
            'OD': 3,
            'PU': 4,
            'EN': 5,
            'MA': 6
        }
        self.get_dataset()

    def get_indic_normalizers(self):
        '''
        Get indic nlp normalizers for preprocessing data
        '''
        normalizers = {}
        for lang in self.lang_map:
            if self.lang_map[lang] != "en":
                normalizers[self.lang_map[lang]] = self.factory.get_normalizer(
                    self.lang_map[lang], remove_nuktas=False)
        return normalizers

    def get_dataset(self):
        self.embeddings = vocab.Vectors(name=self.index_path,
                                        cache=self.cache_path)
        self.vocabulary = torchtext.data.Field()
        # Adding pad and unk token
        self.embeddings.stoi[self.vocabulary.pad_token] = len(
            self.embeddings.stoi)
        self.embeddings.vectors[self.embeddings.stoi[
            self.vocabulary.pad_token]] = torch.zeros(300)
        self.embeddings.stoi[self.vocabulary.unk_token] = len(
            self.embeddings.stoi)
        self.embeddings.vectors[self.embeddings.stoi[
            self.vocabulary.unk_token]] = torch.zeros(300)

        for lang in ['en', 'hi', 'gu', 'pa', 'or', 'mr', 'bn']:
            for d in self.dataset:
                if self.lang_map[d["Target_ID"]] == lang:
                    try:
                        # Remove unknown tokens
                        self.targets.append(self.embeddings.vectors[
                            self.embeddings.stoi[d["Target_keyword"]]])
                        self.src_lang.append(self.lang_map[d["Source_ID"]])
                        self.target_lang.append(self.lang_map[d["Target_ID"]])
                        self.phrases.append(d["Source_text"])
                    except KeyError:
                        #print(d["Target_keyword"] + " not found")
                        pass

    def en_tokenizer(self, document):
        '''
        Borrowed preprocessing script from
        https://stackabuse.com/python-for-nlp-working-with-facebook-fasttext-library/
        '''
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))
        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        # Converting to Lowercase
        document = document.lower()
        # Lemmatization
        tokens = document.split()
        tokens = [self.stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in self.en_stop]
        tokens = [word for word in tokens if len(word) > 3]
        return tokens

    def indic_tokenizer(self, text, lang):
        '''
        Tokenizer for indic nlp
        '''
        # Tokenize
        tokens = indic_tokenize.trivial_tokenize(text=text, lang=lang)
        # Normalize
        for i in range(len(tokens)):
            tokens[i] = self.normalizers[lang].normalize(tokens[i])
        return tokens

    def preprocessing_data(self, idx, src=True):
        tokens = []
        if src:
            if self.src_lang[idx] != "en":
                tokens = self.indic_tokenizer(self.phrases[idx],
                                              self.src_lang[idx])
            else:
                tokens = self.en_tokenizer(self.phrases[idx])
        t_length = len(tokens)
        if t_length < self.max_seq_length:
            pad_token_length = self.max_seq_length - t_length
            tokens.extend([self.vocabulary.pad_token] * pad_token_length)
        else:
            tokens = tokens[:self.max_seq_length]
        return tokens

    def tokens2tensor(self, tokens):
        '''
        Convert tokens to integer tensors
        '''
        input_id_vector = []
        for t in tokens:
            if self.embeddings.stoi.get(t) is None:
                input_id_vector.append(
                    self.embeddings.stoi[self.vocabulary.unk_token])
            else:
                input_id_vector.append(self.embeddings.stoi[t])
        input_id_vector = torch.tensor(input_id_vector)
        return input_id_vector

    def __getitem__(self, idx):
        '''
        Get item function pytorch

        Arguments:
        idx - text index
        '''
        tokens = self.preprocessing_data(idx, src=True)
        input_idx = self.tokens2tensor(tokens)
        #target = torch.tensor(self.targets[idx])
        target = (self.targets[idx])
        label = torch.ones(target.shape[0], 1)
        return {
            "phrase": {
                'input_ids': input_idx.squeeze(),
            },
            "target": target,
            "label": label
        }

    def __len__(self):
        '''
        Returns length of dataset
        '''
        return len(self.phrases)
def parse_news_2015(infname, outdir, prefix, src_lang, tgt_lang):
    """
    infname: input XML file
    outdir: output dir
    prefix: 'test', or 'train'
    src_lang
    tgt_lang
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        lang_code_mapping[tgt_lang] if tgt_lang in lang_code_mapping else tgt_lang,
        False)

    # parser
    tree = ET.parse(infname)
    root = tree.getroot()

    # open files
    srcfile = codecs.open(outdir + '/{}.{}'.format(prefix, src_lang), 'w', 'utf-8')
    tgtfile = codecs.open(outdir + '/{}.{}'.format(prefix, tgt_lang), 'w', 'utf-8')
    idfile = codecs.open(outdir + '/{}.{}'.format(prefix, 'id'), 'w', 'utf-8')

    # stats
    pairs = 0
    chars_src = 0
    chars_org = 0
    chars_norm = 0

    for name in root:
        srcnode = name.find('SourceName')
        name_id = name.attrib['ID']
        src_text = srcnode.text
        src_words = src_text.split(' ')

        children = None
        if prefix == 'train':
            ## use for training corpus
            children = name.findall('TargetName')
        else:
            # use for test corpus
            children = [name.find('TargetName')]

        for tgtnode in children:
            tgt_id = tgtnode.attrib['ID']
            tgt_text = tgtnode.text
            tgt_words = tgt_text.split(' ')

            # if an input entry contains multiple words
            # Case 1: generate one line per word
            #if len(src_words)==len(tgt_words):
            #    for offsetno, (src_word,tgt_word) in enumerate(zip(src_words,tgt_words)):
            #        srcfile.write(u' '.join(src_word)+'\n')
            #        tgtfile.write(u' '.join(tgt_word)+'\n')
            #        idfile.write('{}_{}_{}\n'.format(name_id,tgt_id,offsetno))
            #        pairs+=1
            #        chars_src+=len(src_word)
            #        chars_org+=len(tgt_word)
            #        if tgt_lang in lang_code_mapping:
            #            tgt_word=normalizer.normalize(tgt_word)
            #        chars_norm+=len(tgt_word)

            # Case 2: generate just a single word
            srcfile.write(u' _ '.join(
                [u' '.join(src_word.upper()) for src_word in src_words]) + '\n')
            tgtfile.write(u' _ '.join(
                [u' '.join(tgt_word.upper()) for tgt_word in tgt_words]) + '\n')
            idfile.write('{}_{}_{}\n'.format(name_id, tgt_id, 0))

    print '{}|{}|{}|{}|{}|{}|{}'.format(prefix, src_lang, tgt_lang, pairs,
                                        chars_src, chars_org, chars_norm)

    srcfile.close()
    tgtfile.close()
    idfile.close()
import argparse
import os
import re
import string
from glob import glob

import sox
from tqdm import tqdm
from joblib import Parallel, delayed

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

# Bhojpuri and Hindi both use the Devanagari script, so we use the Devanagari normalizer.
lang = 'hi'
normalizer = IndicNormalizerFactory().get_normalizer(lang)

pattern_to_remove = '["0-9१-९"\'Z¤ªŸ॰⁄☺]+'


def get_clean_lines(line):
    '''
    Returns line if no foreign character other than pattern is present,
    else returns empty string
    '''
    line = line.strip()
    line = re.sub(
        '[%s]' % re.escape("!\"#$%&\()\'*+,-./:;<=>?@[\\]^_`{|}~‘’“\"ः"), '',
        line)
    if re.search(pattern_to_remove, line):
        return ''
    return line
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Use: echo {text} | python tokenize_indic.py {language}

import sys

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    sys.argv[1], remove_nuktas=False, nasals_mode='do_nothing'
)

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = ' '.join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
import pandas as pd
import numpy as np
import glob
import Levenshtein as Lev
from tqdm import tqdm
import swifter
import argparse

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

lang = 'hi'
normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer(lang)


def wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    # word-level edit distance between the two sentences
    return Lev.distance(''.join(w1), ''.join(w2))
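# Quick sanity check of the word-to-character mapping trick (illustrative inputs,
# not part of the original script). Each distinct word is mapped to one character,
# so the Levenshtein distance over the mapped strings counts word-level edits.
# Note that, as written, wer() returns the raw edit distance; callers typically
# divide by the reference length to obtain a rate.

print(wer("मैं घर जा रहा हूँ", "मैं घर नहीं जा रहा"))  # 2: 'नहीं' inserted, 'हूँ' dropped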