def preprocess(self, sents, lang):
    """Normalize, tokenize and script-convert (for Indic) a list of sentences.

    Args:
        sents: iterable of input sentences.
        lang: language code; "en" skips Indic normalization.

    Returns:
        List of preprocessed sentences, in input order.
    """
    # Both branches ran the same comprehension and differed only in the
    # normalizer argument, so compute the normalizer once and share the loop.
    if lang == "en":
        normalizer = None
    else:
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(lang)
    # tqdm is progress reporting only; each line is processed independently.
    return [self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)]
def _init_normalizers(self):
    """Populate ``self.normalizer_map`` with a per-language IndicNormalizer.

    Every language shares the instance-level normalization options; a few
    scripts additionally enable language-specific canonicalization flags.
    """
    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    # Options shared by every language (taken from instance configuration).
    common = dict(
        nasals_mode=self.nasals_mode,
        do_normalize_chandras=self.do_normalize_chandras,
        remove_nuktas=self.remove_nuktas,
        do_normalize_vowel_ending=self.do_normalize_vowel_ending,
    )
    # Language-specific extras; an empty dict means the common options suffice.
    extras = {
        'hi': {}, 'mr': {}, 'sa': {}, 'kK': {}, 'ne': {}, 'sd': {},
        'gu': {}, 'ta': {}, 'te': {}, 'kn': {},
        'pa': dict(do_canonicalize_addak=True, do_canonicalize_tippi=True,
                   do_replace_vowel_bases=True),
        'or': dict(do_remap_wa=True),
        'bn': dict(do_canonicalize_khanda_ta=True),
        'as': dict(do_remap_assamese_chars=True, do_canonicalize_khanda_ta=True),
        'ml': dict(do_explicit_half_u=True, do_canonicalize_chillus=True,
                   do_correct_geminated_T=True),
    }
    for lang, extra in extras.items():
        self.normalizer_map[lang] = normalizer_factory.get_normalizer(
            lang, **common, **extra)
def _init_normalizers(self):
    """Build one normalizer per supported language code.

    All entries share the instance's ``nasals_mode``; no language-specific
    options are applied in this variant.
    """
    factory = indic_normalize.IndicNormalizerFactory()
    supported = ('hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu',
                 'ta', 'te', 'kn', 'pa', 'or', 'as', 'ml')
    self.normalizer_map.update(
        (code, factory.get_normalizer(code, nasals_mode=self.nasals_mode))
        for code in supported)
def normalize_corpus_file(infname, outfname, lang):
    """Normalize a corpus file and write a character-spaced copy.

    Each line is script-normalized, then every multi-character token is
    expanded into space-separated characters (single characters pass
    through unchanged).

    Args:
        infname: input corpus path (UTF-8, one sentence per line).
        outfname: output path (UTF-8).
        lang: language code used to pick the Indic normalizer.
    """
    factory = indic_normalize.IndicNormalizerFactory()
    normalizer = factory.get_normalizer(lang)
    # Single combined `with`; iterate the file lazily instead of
    # readlines(), so large corpora are not loaded into memory at once.
    with codecs.open(infname, 'r', 'utf-8') as ifile, \
            codecs.open(outfname, 'w', 'utf-8') as ofile:
        for line in ifile:
            normalized_line = normalizer.normalize(line)
            normalized_line = u' '.join(
                c if len(c) == 1 else u' '.join(c)
                for c in normalized_line.strip().split())
            ofile.write(normalized_line + u'\n')
def preprocess(infname, outfname, lang, transliterate=False):
    """
    Normalize, tokenize and script convert(for Indic)
    return number of sentences input file
    """
    # Count lines under an explicit context manager: the original generator
    # expression leaked the file handle and relied on the platform encoding.
    with open(infname, "r", encoding="utf-8") as f:
        num_lines = sum(1 for _ in f)

    def _write_lines(lines, outfile):
        # Write processed lines; return how many were written.
        count = 0
        for line in lines:
            outfile.write(line + "\n")
            count += 1
        return count

    with open(infname, "r", encoding="utf-8") as infile, open(
            outfname, "w", encoding="utf-8") as outfile:
        if lang in ("en", "vi", "bg", "tr"):
            # Non-Indic languages handled by the generic line preprocessor.
            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, None, lang)
                for line in tqdm(infile, total=num_lines))
        elif lang == "ar":
            # Arabic has its own cleaning pipeline (run sequentially).
            out_lines = [
                clean_ar_text(
                    text=line,
                    remove_diacritics=True,
                    segment=True,
                    normalize=True,
                ) for line in tqdm(infile, total=num_lines)
            ]
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, normalizer, lang, transliterate)
                for line in tqdm(infile, total=num_lines))
        return _write_lines(out_lines, outfile)
def _init_normalizers(self):
    """Create a ``nasals_mode``-aware normalizer for each supported language.

    All languages use identical options here, so a single loop replaces
    eleven near-duplicate assignments.
    """
    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    for lang in ('hi', 'bn', 'pa', 'gu', 'or', 'as',
                 'mr', 'ta', 'te', 'ml', 'kn'):
        self.normalizer_map[lang] = normalizer_factory.get_normalizer(
            lang, nasals_mode=self.nasals_mode)
def run_normalize(args):
    """CLI entry point: normalize each line of ``args.infile`` into ``args.outfile``."""
    # TODO: add more options to cli
    remove_nuktas = False
    normalize_nasals = 'do_nothing'

    # Build the language-specific normalizer once, outside the loop.
    factory = indic_normalize.IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.lang,
        remove_nuktas=remove_nuktas,
        nasals_mode=normalize_nasals,
    )

    for raw_line in args.infile:
        args.outfile.write(normalizer.normalize(raw_line))
def _init_normalizers(self):
    """Populate ``self.normalizer_map`` with a normalizer per language.

    Every normalizer shares the instance's ``nasals_mode``; a few scripts
    get additional canonicalization flags on top of it.
    """
    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    # Per-language extras; an empty dict means nasals_mode alone suffices.
    extras = {
        'hi': {},
        'bn': {},
        'pa': dict(do_canonicalize_addak=True, do_canonicalize_tippi=True,
                   do_replace_vowel_bases=True),
        'gu': {},
        'or': dict(do_remap_wa=True),
        'as': dict(do_remap_assamese_chars=True),
        'mr': {},
        'ta': {},
        'te': {},
        'ml': dict(do_canonicalize_chillus=True),
        'kn': {},
    }
    for lang, extra in extras.items():
        self.normalizer_map[lang] = normalizer_factory.get_normalizer(
            lang, nasals_mode=self.nasals_mode, **extra)
def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    Returns the number of sentences written.
    """
    # Count lines under a context manager: the original generator expression
    # leaked the handle and used the platform default encoding.
    with open(infname, "r", encoding="utf-8") as f:
        num_lines = sum(1 for _ in f)

    n = 0
    with open(infname, "r", encoding="utf-8") as infile, \
            open(outfname, "w", encoding="utf-8") as outfile:
        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(
                        en_normalizer.normalize(line.strip()), escape=False))
                outfile.write(outline + "\n")
                n += 1
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                # Normalize, tokenize, transliterate to Devanagari, then
                # re-attach the halanta that tokenization split off.
                outline = (unicode_transliterate.UnicodeIndicTransliterator
                           .transliterate(
                               " ".join(
                                   indic_tokenize.trivial_tokenize(
                                       normalizer.normalize(line.strip()),
                                       lang)),
                               lang, "hi").replace(" ् ", "्"))
                outfile.write(outline + "\n")
                n += 1
    return n
from indicnlp import loader from indicnlp.normalize import indic_normalize from indicnlp.transliterate import unicode_transliterate if __name__ == '__main__': """ This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada """ infname=sys.argv[1] # one sentence/word per line. Sentences should be space-tokenized outfname=sys.agv[2] loader.load() normalizer_factory=indic_normalize.IndicNormalizerFactory() normalizer=normalizer_factory.get_normalizer('hi') with open(infname,'r',encoding='utf-8') as infile, \ open(outfname,'w',encoding='utf-8') as outfile: for line in infile: line=line.strip() line=normalizer.normalize(line) ## replace chandrabindus with anusvara line=line.replace('\u0900','\u0902') line=line.replace('\u0901','\u0902') ### replace chandra e and o diacritics with e and o respectively #line=line.replace('\u0945','\u0947') #line=line.replace('\u0949','\u094b')