Ejemplo n.º 1
0
def postprocess(infname,
                outfname,
                input_size,
                lang,
                common_lang="hi",
                transliterate=False):
    """Parse fairseq interactive output and write detokenized translations.

    Hypothesis lines (those starting with ``H-``) are read from ``infname``,
    placed at the position given by their sentence id, detokenized and
    written to ``outfname``, one sentence per line.

    infname: fairseq log file
    outfname: output file of translations; a sentence id without a
        hypothesis in the log is written as an empty line
    input_size: expected number of output sentences
    lang: language of the hypotheses; "en" is detokenized with
        MosesDetokenizer, every other value with
        indic_detokenize.trivial_detokenize
    common_lang: source script passed to the transliterator; only used
        when ``transliterate`` is true and ``lang`` is not "en"
    transliterate: when true (and ``lang`` is not "en"), transliterate each
        sentence from ``common_lang`` to ``lang`` before detokenizing

    Raises IndexError when the log contains a sentence id that is not
    smaller than ``input_size``.
    """
    hypotheses = _read_hypotheses(infname, input_size)

    # The generators are consumed inside the ``with`` block below, so the
    # detokenization still happens sentence by sentence while writing.
    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        detokenized = (en_detok.detokenize(sent.split(" "))
                       for sent in hypotheses)
    elif transliterate:
        # Only build the transliterator when it is actually needed.
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        detokenized = (indic_detokenize.trivial_detokenize(
            xliterator.transliterate(sent, common_lang, lang), lang)
                       for sent in hypotheses)
    else:
        detokenized = (indic_detokenize.trivial_detokenize(sent, lang)
                       for sent in hypotheses)

    with open(outfname, "w", encoding="utf-8") as outfile:
        outfile.writelines(outstr + "\n" for outstr in detokenized)


def _read_hypotheses(infname, input_size):
    """Return the hypotheses of a fairseq log as a list indexed by sentence id."""
    hypotheses = [""] * input_size
    with open(infname, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.startswith("H-"):
                continue
            fields = line.strip().split("\t")
            sid = int(fields[0].split("-")[1])
            # strip() removes the trailing tab of an empty hypothesis, which
            # leaves only the id and score fields; treat that as "".
            hypotheses[sid] = fields[2] if len(fields) > 2 else ""
    return hypotheses
def postprocess(sents, lang, common_lang="hi"):
    """
    detokenize translated sentences and strip "<unk>" markers from them.

    sents: iterable of translated sentences (strings)
    lang: language of the sentences; "en" is detokenized with
        MosesDetokenizer, any other value is transliterated from
        common_lang to lang and then detokenized with
        indic_detokenize.trivial_detokenize
    common_lang: source script passed to the transliterator (unused for "en")

    returns a list with one postprocessed string per input sentence
    """
    if lang == "en":
        moses = MosesDetokenizer(lang="en")
        detokenized = [moses.detokenize(text.split(" ")) for text in sents]
    else:
        script_converter = unicode_transliterate.UnicodeIndicTransliterator()
        detokenized = [
            indic_detokenize.trivial_detokenize(
                script_converter.transliterate(text, common_lang, lang), lang)
            for text in sents
        ]

    # Remove every occurrence of the "<unk>" marker from the final strings.
    return [text.replace("<unk>", "") for text in detokenized]
Ejemplo n.º 3
0
def translate_mr(s):
    s=s.lower()
     #Apply tokenization
    tokenize = MosesTokenizer('en')
    s=' '.join(tokenize(s))
    with open('sentence.txt','w') as f:
        f.write(s)
    #Apply BPE
    !subword-nmt apply-bpe -c mr-bpe-codes.en < sentence.txt > bpe-sentence.txt
    #Translate using OpenNMT
    !onmt_translate -model mr-model.pt -src bpe-sentence.txt -output bpe-trans.txt -replace_unk -gpu 0
    #De-BPE
    !cat bpe-trans.txt | sed -E 's/(@@ )|(@@ ?$)//g' > trans.txt
    out = ""
    with open('trans.txt') as f:
        for i in f:
            out+=i
    return indic_detokenize.trivial_detokenize(out,lang='mr')
Ejemplo n.º 4
0
"""

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# #Translating Test Data
# !subword-nmt apply-bpe -c mr-bpe-codes.en < processed-test-pmi.txt > bpe-processed-test-pmi.txt
# !onmt_translate -model mr-model.pt -src bpe-processed-test-pmi.txt -output bpe-trans-pmi.txt -replace_unk -gpu 0
# !cat bpe-trans-pmi.txt | sed -E 's/(@@ )|(@@ ?$)//g' > trans-pmi.txt

"""**Detokenize predicted translations**"""

from indicnlp.tokenize import indic_detokenize  
# Detokenize every predicted line of trans-pmi.txt into detok-trans-pmi.txt.
# Both files are opened as UTF-8 explicitly so the text does not depend on
# the platform's default encoding.
with open('trans-pmi.txt', 'r', encoding='utf-8') as a, \
        open('detok-trans-pmi.txt', 'w', encoding='utf-8') as b:
    for item in a:
        # Each line is written exactly as the detokenizer returns it; no
        # newline is added here.
        b.write(indic_detokenize.trivial_detokenize(item, lang='mr'))

"""**Create a function to calculate BLEU score given hypothesis and reference files**"""

import sacrebleu

#Function to calculate BLEU score
def score(r,h):
    """Load the lines of a reference file and a hypothesis file.

    r: path of the reference file
    h: path of the hypothesis file

    As written here, the function only collects the lines of both files
    (line endings included) into local lists and returns None.
    NOTE(review): no BLEU computation is visible in this excerpt - confirm
    against the complete file.
    """
    with open(r, 'r') as ref_file:
        ref = list(ref_file)
    with open(h, 'r') as hyp_file:
        hypothesis = list(hyp_file)
Ejemplo n.º 5
0
def run_detokenize(args):
    """Detokenize every line of ``args.infile`` and write it to ``args.outfile``.

    args: object providing ``infile`` (iterated line by line), ``outfile``
        (its ``write`` method receives each detokenized line) and ``lang``
        (passed to ``indic_detokenize.trivial_detokenize``)
    """
    for raw_line in args.infile:
        detokenized = indic_detokenize.trivial_detokenize(raw_line, args.lang)
        args.outfile.write(detokenized)
Ejemplo n.º 6
0
def detok(sentence, lang):
    """Detokenize ``sentence`` for the language code ``lang``.

    "en" sentences are split on whitespace and joined by MosesDetokenizer;
    "ne" sentences go through indic_detokenize.trivial_detokenize.
    Any other language code yields None.
    """
    if lang == "ne":
        return indic_detokenize.trivial_detokenize(sentence, "ne")
    if lang == "en":
        english_detokenizer = MosesDetokenizer(lang="en")
        return english_detokenizer.detokenize(sentence.split())
    return None
0
from indicnlp.tokenize import indic_detokenize
# Tokenized Hindi sample used to demonstrate the detokenizer.
indic_string = '" सुनो , कुछ आवाज़ आ रही है . " , उसने कहा । '

# Show the raw input first, then the detokenized result.
print('Input String: {}'.format(indic_string))
detokenized_string = indic_detokenize.trivial_detokenize(indic_string, lang='hi')
print('Detokenized String: {}'.format(detokenized_string))