Ejemplo n.º 1
0
    def preprocess(self, sents, lang):
        """
        Run every sentence through ``self.preprocess_sent``.

        Normalization, tokenization and (for Indic languages) script
        conversion are expected to happen inside ``preprocess_sent``; this
        method only selects the normalizer and maps over the input.

        Args:
            sents: iterable of input sentences.
            lang: language code. "en" is processed without a normalizer;
                any other value is handed to
                ``IndicNormalizerFactory.get_normalizer``.

        Returns:
            A list holding one ``preprocess_sent`` result per input
            sentence, in input order.
        """
        # English bypasses the Indic normalizer, so preprocess_sent gets None.
        if lang == "en":
            normalizer = None
        else:
            factory = indic_normalize.IndicNormalizerFactory()
            normalizer = factory.get_normalizer(lang)

        return [
            self.preprocess_sent(sentence, normalizer, lang)
            for sentence in tqdm(sents)
        ]
Ejemplo n.º 2
0
    def _init_normalizers(self):
        """Fill ``self.normalizer_map`` with one normalizer per language code.

        Every normalizer is created with the instance-level settings
        (``nasals_mode``, ``do_normalize_chandras``, ``remove_nuktas``,
        ``do_normalize_vowel_ending``). Some languages additionally receive
        language-specific flags, listed in ``extra_options`` below.
        """
        factory = indic_normalize.IndicNormalizerFactory()

        # Settings applied to every language.
        shared_options = {
            'nasals_mode': self.nasals_mode,
            'do_normalize_chandras': self.do_normalize_chandras,
            'remove_nuktas': self.remove_nuktas,
            'do_normalize_vowel_ending': self.do_normalize_vowel_ending,
        }

        # Languages that only need the shared settings.
        for code in ('hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'gu', 'ta', 'te', 'kn'):
            self.normalizer_map[code] = factory.get_normalizer(code, **shared_options)

        # Languages that need extra flags on top of the shared settings.
        extra_options = (
            ('pa', {'do_canonicalize_addak': True,
                    'do_canonicalize_tippi': True,
                    'do_replace_vowel_bases': True}),
            ('or', {'do_remap_wa': True}),
            ('bn', {'do_canonicalize_khanda_ta': True}),
            ('as', {'do_remap_assamese_chars': True,
                    'do_canonicalize_khanda_ta': True}),
            ('ml', {'do_explicit_half_u': True,
                    'do_canonicalize_chillus': True,
                    'do_correct_geminated_T': True}),
        )
        for code, extras in extra_options:
            options = dict(shared_options, **extras)
            self.normalizer_map[code] = factory.get_normalizer(code, **options)
    def _init_normalizers(self):
        """Create a normalizer for each supported language code.

        Each normalizer is built with ``self.nasals_mode`` and stored in
        ``self.normalizer_map`` under its language code.
        """
        factory = indic_normalize.IndicNormalizerFactory()
        supported_codes = (
            'hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu',
            'ta', 'te', 'kn', 'pa', 'or', 'as', 'ml',
        )

        for code in supported_codes:
            normalizer = factory.get_normalizer(code, nasals_mode=self.nasals_mode)
            self.normalizer_map[code] = normalizer
Ejemplo n.º 4
0
def normalize_corpus_file(infname,outfname,lang):
    """Normalize a UTF-8 corpus file and write it out split into characters.

    Each input line is normalized with the normalizer for ``lang`` and split
    on whitespace; the characters of every token are then separated by single
    spaces, so each output line is a space-separated character sequence.

    Args:
        infname: path of the UTF-8 input file.
        outfname: path of the UTF-8 output file (overwritten).
        lang: language code passed to ``IndicNormalizerFactory.get_normalizer``.
    """
    factory=indic_normalize.IndicNormalizerFactory()
    normalizer=factory.get_normalizer(lang)

    with codecs.open(infname,'r','utf-8') as ifile, \
            codecs.open(outfname,'w','utf-8') as ofile:
        # Iterate the file lazily instead of readlines() so a large corpus is
        # never held in memory all at once.
        for line in ifile:
            tokens=normalizer.normalize(line).strip().split()
            # Joining the characters of a one-character token yields the token
            # itself, so no special case is needed for len(token)==1.
            ofile.write(u' '.join(u' '.join(token) for token in tokens)+u'\n')
Ejemplo n.º 5
0
def preprocess(infname, outfname, lang, transliterate=False):
    """
    Preprocess a corpus file line by line and write the result to ``outfname``.

    Depending on ``lang`` each line is handled by:
      - "en", "vi", "bg", "tr": ``preprocess_line(line, None, lang)``
      - "ar": ``clean_ar_text`` with diacritics removal, segmentation and
        normalization enabled
      - anything else: ``preprocess_line`` with the Indic normalizer for
        ``lang`` and the ``transliterate`` flag

    Args:
        infname: path of the UTF-8 input file.
        outfname: path of the UTF-8 output file (overwritten).
        lang: language code selecting the processing path above.
        transliterate: forwarded to ``preprocess_line`` on the Indic path only.

    Returns:
        Number of lines written to ``outfname``.
    """
    # Count lines up front so tqdm can show a total. The handle is closed
    # deterministically and uses the same encoding as the real pass below.
    with open(infname, "r", encoding="utf-8") as infile:
        num_lines = sum(1 for _ in infile)

    plain_langs = ("en", "vi", "bg", "tr")

    # Build the Indic normalizer before opening the output file, so a bad
    # language code cannot leave a truncated output file behind.
    normalizer = None
    if lang not in plain_langs and lang != "ar":
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(lang)

    with open(infname, "r", encoding="utf-8") as infile, open(
            outfname, "w", encoding="utf-8") as outfile:
        lines = tqdm(infile, total=num_lines)

        if lang in plain_langs:
            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, None, lang) for line in lines)
        elif lang == "ar":
            out_lines = [
                clean_ar_text(
                    text=line,
                    remove_diacritics=True,
                    segment=True,
                    normalize=True,
                ) for line in lines
            ]
        else:
            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, normalizer, lang, transliterate)
                for line in lines)

        n = 0
        for line in out_lines:
            outfile.write(line + "\n")
            n += 1
    return n
Ejemplo n.º 6
0
    def _init_normalizers(self):
        """Register a normalizer for each supported language code.

        All normalizers are created with ``self.nasals_mode`` and stored in
        ``self.normalizer_map`` keyed by language code.
        """
        factory = indic_normalize.IndicNormalizerFactory()
        language_codes = ('hi', 'bn', 'pa', 'gu', 'or', 'as',
                          'mr', 'ta', 'te', 'ml', 'kn')

        for code in language_codes:
            self.normalizer_map[code] = factory.get_normalizer(
                code, nasals_mode=self.nasals_mode)
Ejemplo n.º 7
0
def run_normalize(args):
    """Normalize ``args.infile`` line by line and write to ``args.outfile``.

    Args:
        args: object providing ``lang`` (language code for the normalizer),
            ``infile`` (iterable of lines) and ``outfile`` (object with a
            ``write`` method).
    """
    # TODO: add more options to cli
    normalizer_options = {
        'remove_nuktas': False,
        'nasals_mode': 'do_nothing',
    }

    normalizer = indic_normalize.IndicNormalizerFactory().get_normalizer(
        args.lang, **normalizer_options)

    # The normalizer output is written as returned; no newline is added here.
    for raw_line in args.infile:
        args.outfile.write(normalizer.normalize(raw_line))
Ejemplo n.º 8
0
    def _init_normalizers(self):
        """Register a normalizer for each supported language code.

        Every normalizer is created with ``self.nasals_mode``; a few
        languages also receive the language-specific flags listed below.
        Results are stored in ``self.normalizer_map`` keyed by language code.
        """
        factory = indic_normalize.IndicNormalizerFactory()

        # (language code, extra keyword arguments) in registration order.
        language_options = (
            ('hi', {}),
            ('bn', {}),
            ('pa', {'do_canonicalize_addak': True,
                    'do_canonicalize_tippi': True,
                    'do_replace_vowel_bases': True}),
            ('gu', {}),
            ('or', {'do_remap_wa': True}),
            ('as', {'do_remap_assamese_chars': True}),
            ('mr', {}),
            ('ta', {}),
            ('te', {}),
            ('ml', {'do_canonicalize_chillus': True}),
            ('kn', {}),
        )

        for code, extras in language_options:
            self.normalizer_map[code] = factory.get_normalizer(
                code, nasals_mode=self.nasals_mode, **extras)
Ejemplo n.º 9
0
def old_preprocess(infname, outfname, lang):
    """
    Prepare one corpus file and write the result to ``outfname``.

    Per line:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts (non-"en" only)

    Args:
        infname: path of the UTF-8 input file.
        outfname: path of the UTF-8 output file (overwritten).
        lang: language code; "en" uses the Moses normalizer/tokenizer, any
            other value uses the Indic NLP normalizer, tokenizer and
            transliterator.

    Returns:
        Number of lines written to ``outfname``.
    """
    # Count lines up front so tqdm can show a total. The handle is closed
    # deterministically and uses the same encoding as the real pass below.
    with open(infname, "r", encoding="utf-8") as infile:
        num_lines = sum(1 for _ in infile)

    n = 0
    with open(infname, "r", encoding="utf-8") as infile, open(
            outfname, "w", encoding="utf-8") as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()

            def convert(line):
                normalized = en_normalizer.normalize(line.strip())
                return " ".join(en_tok.tokenize(normalized, escape=False))

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)

            def convert(line):
                tokens = indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)
                converted = (unicode_transliterate.UnicodeIndicTransliterator.
                             transliterate(" ".join(tokens), lang, "hi"))
                # Re-attach a virama that ended up surrounded by spaces.
                return converted.replace(" ् ", "्")

        for line in tqdm(infile, total=num_lines):
            outfile.write(convert(line) + "\n")
            n += 1
    return n
Ejemplo n.º 10
0
from indicnlp import loader
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

if __name__ == '__main__': 
    """
    This script transliterates Hindi to Kannada. It removes/remaps 
    characters only found in Hindi. It also adds halanta to words ending
    with consonant - as is the convention in Kannada
    """
    # Imported locally so this entry point does not rely on an import made
    # elsewhere in the file.
    import sys

    infname=sys.argv[1]  # one sentence/word per line. Sentences should be space-tokenized
    outfname=sys.argv[2]  # was `sys.agv`, which raised AttributeError
    loader.load()

    normalizer_factory=indic_normalize.IndicNormalizerFactory()
    normalizer=normalizer_factory.get_normalizer('hi')

    with open(infname,'r',encoding='utf-8') as infile, \
         open(outfname,'w',encoding='utf-8') as outfile:
        for line in infile: 
            line=line.strip()
            line=normalizer.normalize(line)
    
            ## replace chandrabindus with anusvara
            line=line.replace('\u0900','\u0902')
            line=line.replace('\u0901','\u0902')
    
            ### replace chandra e and o diacritics with e and o respectively
            #line=line.replace('\u0945','\u0947')
            #line=line.replace('\u0949','\u094b')