Beispiel #1
0
def encode_itranslate_decode(i, num_map, tp_tokenizer, num_hypotheses=3):
    """Translate one sentence interactively and decode its hypotheses.

    ``i['src']`` is subword-encoded with the encoder returned by
    ``get_model_path`` and sent to the translator registered under
    ``i['id']`` (beam size 5). When ``i`` holds a non-empty, non-blank
    ``target_prefix`` the prefix is digit-normalised, number-substituted,
    optionally tokenised, subword-encoded and forwarded to the translator
    as ``target_prefix``.

    Args:
        i: Mutable mapping describing the request. Reads ``'id'`` and
            ``'src'``; when a prefix is supplied also ``'tgt_lang'`` and
            ``'target_prefix'``. ``'src'`` and ``'target_prefix'`` are
            overwritten in place with their encoded forms.
        num_map: Forwarded to ``replace_num_target_prefix``.
        tp_tokenizer: Callable applied to the target prefix before subword
            encoding, or ``None`` to skip tokenisation.
        num_hypotheses: Number of hypotheses requested from the translator.

    Returns:
        The result of ``multiple_hypothesis_decoding`` for the first (only)
        batch entry.

    Raises:
        Exception: Any error is logged via ``log_exception`` and re-raised.
    """
    try:
        log_info("Inside encode_itranslate_decode function", MODULE_CONTEXT)
        # The model path itself is not needed here, only the two subword models.
        _model_path, sp_encoder, sp_decoder = get_model_path(i['id'])
        translator = load_models.loaded_models[i['id']]
        i['src'] = str(sp.encode_line(sp_encoder, i['src']))
        i_final = format_converter(i['src'])

        # Options shared by the prefixed and un-prefixed translation call.
        translate_kwargs = {
            'beam_size': 5,
            'num_hypotheses': num_hypotheses,
            'replace_unknowns': True,
        }

        # A missing, None, empty or whitespace-only prefix means "no prefix".
        # (Previously a None prefix crashed on len(None).)
        target_prefix = i['target_prefix'] if 'target_prefix' in i else None
        if target_prefix and not target_prefix.isspace():
            log_info("target prefix: {}".format(target_prefix),
                     MODULE_CONTEXT)
            i['target_prefix'] = misc.convert_digits_preprocess(
                i['tgt_lang'], target_prefix)
            i['target_prefix'] = replace_num_target_prefix(i, num_map)
            if tp_tokenizer is not None:
                i['target_prefix'] = tp_tokenizer(i['target_prefix'])
            i['target_prefix'] = str(
                sp.encode_line(sp_decoder, i['target_prefix']))
            tp_final = format_converter(i['target_prefix'])
            # Swap any ']' in the last prefix token for ','.
            # NOTE(review): presumably clean-up of the stringified token
            # list produced above - confirm against format_converter.
            tp_final[-1] = tp_final[-1].replace(']', ",")
            translate_kwargs['target_prefix'] = [tp_final]

        m_out = translator.translate_batch([i_final], **translate_kwargs)
        return multiple_hypothesis_decoding(m_out[0], sp_decoder)

    except Exception as e:
        log_exception(
            "Unexpexcted error in encode_itranslate_decode: {} and {}".format(
                e,
                sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
Beispiel #2
0
    def interactive_translation(inputs):
        """Run interactive (multi-hypothesis) translation for each input.

        For every input the source text is stripped and digit-normalised.
        Sentences matching a special case are answered by
        ``special_case_handler``; all others are tagged (numbers, dates,
        URLs), tokenised according to the model id, translated, detokenised
        and post-processed (regex clean-up, tag restoration, digit
        conversion).

        Args:
            inputs: Iterable of mutable mappings. Each must contain ``'src'``
                and ``'id'``; ``'s_id'`` and ``'target_prefix'`` are
                optional. The mappings are modified in place (``'src'``,
                ``'src_lang'``, ``'tgt_lang'``).

        Returns:
            A ``CustomResponse``: success with one entry per input
            (``tgt``, ``tagged_tgt``, ``tagged_src``, ``s_id``, ``src``),
            ``ID_OR_SRC_MISSING`` as soon as an input lacks ``'src'`` or
            ``'id'``, or ``SYSTEM_ERR`` (with ``'why'`` set to the error
            text) when anything raises, including an unsupported model id.
        """
        # Model ids grouped by the processing pipeline they share.
        # No extra tokenisation / detokenisation:
        # en-ta, en-gu, en-kn, en-te, en-ml, en-pa, en-mr
        untokenized_ids = (7, 10, 15, 16, 17, 18, 42)
        # Indic source -> English target:
        # te-en, hi-en, mr-en, ta-en, pa-en, kn-en, ml-en, gu-en, bn-en (3rd)
        indic_to_en_ids = (50, 6, 62, 8, 55, 48, 60, 52, 66)
        # English source -> Indic target: en-hi, en-bn (4th)
        en_to_indic_ids = (56, 65)
        # Same two directions, served by encode_itranslate_decode_v2:
        # ta-en (3rd), hi-en (3rd), te-en (2nd), ml-en (2nd)
        indic_to_en_v2_ids = (67, 69, 71, 73)
        # en-ta (5th), en-hi (15th), en-te (3rd), en-ml (3rd)
        en_to_indic_v2_ids = (68, 70, 72, 74)

        out = {}
        i_src, tgt = [], []
        tagged_tgt, tagged_src = [], []
        sentence_id = []

        try:
            for i in inputs:
                # Reset per sentence so a tokenizer picked for one model id
                # cannot leak into a later input whose model uses none.
                tp_tokenizer = None

                sentence_id.append(i.get("s_id") or "NA")
                if any(v not in i for v in ['src', 'id']):
                    log_info("either id or src missing in some input",
                             MODULE_CONTEXT)
                    out = CustomResponse(Status.ID_OR_SRC_MISSING.value,
                                         inputs)
                    return out

                log_info("input sentence:{}".format(i['src']), MODULE_CONTEXT)
                # Keep the untouched source for the response.
                i_src.append(i['src'])
                i['src'] = i['src'].strip()

                i['src_lang'], i['tgt_lang'] = misc.get_src_tgt_langauge(
                    i['id'])
                i['src'] = misc.convert_digits_preprocess(
                    i['src_lang'], i['src'])

                if special_case_handler.special_case_fits(i['src']):
                    log_info(
                        "sentence fits in special case, returning accordingly and not going to model",
                        MODULE_CONTEXT)
                    translation = [
                        special_case_handler.handle_special_cases(
                            i['src'], i['id'])
                    ]
                    tag_tgt, tag_src = translation, i['src']

                else:
                    log_info(
                        "Performing interactive translation on:{}".format(
                            i['id']), MODULE_CONTEXT)
                    i['src'], date_original, url_original, num_array, num_map = tagger_util.tag_number_date_url(
                        i['src'])
                    tag_src = i['src']

                    model_id = i['id']
                    if model_id in untokenized_ids:
                        src_tokenizer = None
                        detokenizer = None
                    elif (model_id in indic_to_en_ids
                          or model_id in indic_to_en_v2_ids):
                        tp_tokenizer = sentence_processor.moses_tokenizer
                        src_tokenizer = sentence_processor.indic_tokenizer
                        detokenizer = sentence_processor.moses_detokenizer
                    elif (model_id in en_to_indic_ids
                          or model_id in en_to_indic_v2_ids):
                        tp_tokenizer = sentence_processor.indic_tokenizer
                        src_tokenizer = sentence_processor.moses_tokenizer
                        detokenizer = sentence_processor.indic_detokenizer
                    else:
                        log_info(
                            "unsupported model id: {} for given input".format(
                                model_id), MODULE_CONTEXT)
                        raise Exception(
                            "Unsupported Model ID - id: {} for given input".
                            format(model_id))

                    # Only the english-hindi model (56) title-cases
                    # all-uppercase input before tokenising.
                    if model_id == 56 and i['src'].isupper():
                        log_info(
                            "src all Upper case hence Tital casing it",
                            MODULE_CONTEXT)
                        i['src'] = i['src'].title()

                    if src_tokenizer is not None:
                        i['src'] = src_tokenizer(i['src'])

                    if (model_id in indic_to_en_v2_ids
                            or model_id in en_to_indic_v2_ids):
                        translation = encode_itranslate_decode_v2(
                            i, num_map, tp_tokenizer)
                    else:
                        translation = encode_itranslate_decode(
                            i, num_map, tp_tokenizer)

                    if detokenizer is not None:
                        translation = [
                            detokenizer(hypothesis)
                            for hypothesis in translation
                        ]

                    # Replace each subword marker with a space.
                    translation = [
                        hypothesis.replace("▁", " ")
                        for hypothesis in translation
                    ]
                    translation = [
                        misc.regex_pass(hypothesis, [
                            patterns['p8'], patterns['p9'], patterns['p4'],
                            patterns['p5'], patterns['p6'], patterns['p7']
                        ]) for hypothesis in translation
                    ]
                    # Snapshot before the placeholder tags are swapped back.
                    tag_tgt = translation
                    translation = [
                        tagger_util.replace_tags_with_original(
                            hypothesis, date_original, url_original,
                            num_array, num_map)
                        for hypothesis in translation
                    ]
                    translation = [
                        misc.convert_digits_postprocess(
                            i['tgt_lang'], hypothesis)
                        for hypothesis in translation
                    ]

                log_info(
                    "interactive translation-experiment-{} output: {}".format(
                        i['id'], translation), MODULE_CONTEXT)
                tgt.append(translation)
                tagged_tgt.append(tag_tgt)
                tagged_src.append(tag_src)

            response_body = [{
                "tgt": target,
                "tagged_tgt": tagged_target,
                "tagged_src": tagged_source,
                "s_id": s_id,
                "src": source
            } for target, tagged_target, tagged_source, s_id, source in zip(
                tgt, tagged_tgt, tagged_src, sentence_id, i_src)]
            out = CustomResponse(Status.SUCCESS.value, response_body)
        except Exception as e:
            # Work on a copy so the shared enum value is not mutated across
            # requests. NOTE(review): assumes Status.SYSTEM_ERR.value is a
            # dict - confirm against the Status definition.
            status = dict(Status.SYSTEM_ERR.value)
            status['why'] = str(e)
            log_exception(
                "Unexpected error:%s and %s" % (e, sys.exc_info()[0]),
                MODULE_CONTEXT, e)
            out = CustomResponse(status, inputs)

        return out
Beispiel #3
0
    def translate_func(inputs):

        inputs = inputs
        out = {}
        pred_score = list()
        sentence_id, node_id = list(), list()
        input_subwords, output_subwords = list(), list()
        i_src, tgt = list(), list()
        tagged_tgt, tagged_src = list(), list()
        s_id, n_id = [0000], [0000]
        i_s0_src, i_s0_tgt, i_save = list(), list(), list()
        i_tmx_phrases = list()

        try:
            for i in inputs:
                s0_src, s0_tgt, save = "NA", "NA", False
                if all(v in i for v in ['s_id', 'n_id']):
                    s_id = [i['s_id']]
                    n_id = [i['n_id']]

                if any(v not in i for v in ['src', 'id']):
                    log_info("either id or src missing in some input",
                             MODULE_CONTEXT)
                    out = CustomResponse(Status.ID_OR_SRC_MISSING.value,
                                         inputs)
                    return out

                if any(v in i for v in ['s0_src', 's0_tgt', 'save']):
                    s0_src, s0_tgt, save = handle_custome_input(
                        i, s0_src, s0_tgt, save)

                i_s0_src.append(s0_src), i_s0_tgt.append(
                    s0_tgt), i_save.append(save)

                log_info("input sentences:{}".format(i['src']), MODULE_CONTEXT)
                i_src.append(i['src'])
                i['src'] = i['src'].strip()

                src_language, tgt_language = misc.get_src_tgt_langauge(i['id'])
                if src_language == 'English' and i['src'].isupper():
                    i['src'] = i['src'].title()
                i['src'] = misc.convert_digits_preprocess(
                    src_language, i['src'])

                if special_case_handler.special_case_fits(i['src']):
                    log_info(
                        "sentence fits in special case, returning accordingly and not going to model",
                        MODULE_CONTEXT)
                    translation = special_case_handler.handle_special_cases(
                        i['src'], i['id'])
                    scores = [1]
                    input_sw, output_sw, tag_tgt, tag_src = "", "", translation, i[
                        'src']

                else:
                    log_info("translating using NMT-model:{}".format(i['id']),
                             MODULE_CONTEXT)
                    prefix, i['src'] = special_case_handler.prefix_handler(
                        i['src'])
                    i['src'], date_original, url_original, num_array, num_map = tagger_util.tag_number_date_url(
                        i['src'])
                    tag_src = (prefix + " " + i['src']).lstrip()

                    i['src'], is_missing_stop_punc = special_case_handler.handle_a_sentence_wo_stop(
                        src_language, i['src'])

                    if i['id'] == 6:
                        "hi-en_exp-2 05-05-20"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)

                    elif i['id'] == 7:
                        "english-tamil"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 10:
                        "english-gujrati"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 15:
                        "english-kannada"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 16:
                        "english-telgu"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 17:
                        "english-malayalam"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 18:
                        "english-punjabi"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 42:
                        "english-marathi exp-2"
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                    elif i['id'] == 56:
                        "09/12/19-Exp-5.6:"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 8:
                        "ta-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 44:
                        "eng-mr-3rd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 47:
                        "en-kn 2nd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 48:
                        "kn-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 49:
                        "en-tel 2nd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 50:
                        "tel-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 51:
                        "en-guj 2nd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 52:
                        "guj-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 53:
                        "en-punjabi 2nd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 55:
                        "punjabi-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 57:
                        "en-bengali 3rd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 58:
                        "bengali-en 2nd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 59:
                        "en-malay 2nd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 60:
                        "malay-en 1st"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 62:
                        "mr-to-en 2nd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 65:
                        "en-bengali 4th"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 66:
                        "bengali-en 3rd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 67:
                        "ta-en 3rd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 68:
                        "en-ta 5th"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 69:
                        "hi-en 3rd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 70:
                        "en-hi 15th"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 71:
                        "te-en 2nd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 72:
                        "en-te 3rd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    elif i['id'] == 73:
                        "ml-en 2nd"
                        i['src'] = sentence_processor.indic_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.moses_detokenizer(
                            translation)
                    elif i['id'] == 74:
                        "en-ml 3rd"
                        i['src'] = sentence_processor.moses_tokenizer(i['src'])
                        translation, scores, input_sw, output_sw = encode_translate_decode_v2(
                            i)
                        translation = sentence_processor.indic_detokenizer(
                            translation)
                    else:
                        log_info(
                            "Unsupported model id: {} for given input".format(
                                i['id']), MODULE_CONTEXT)
                        raise Exception(
                            "Unsupported Model ID - id: {} for given input".
                            format(i['id']))

                    translation = oc.postprocess_a_sentence_wo_stop(
                        tgt_language, translation, is_missing_stop_punc)
                    translation = (prefix + " " + translation).lstrip()
                    translation = translation.replace("▁", " ")
                    translation = misc.regex_pass(translation, [
                        patterns['p8'], patterns['p9'], patterns['p4'],
                        patterns['p5'], patterns['p6'], patterns['p7']
                    ])
                    tag_tgt = translation
                    translation = tagger_util.replace_tags_with_original(
                        translation, date_original, url_original, num_array,
                        num_map)
                    translation = oc.cleaner(tag_src, translation, i['id'])
                    translation = misc.convert_digits_postprocess(
                        tgt_language, translation)
                log_info(
                    "translate_function-experiment-{} output: {}".format(
                        i['id'], translation), MODULE_CONTEXT)
                tgt.append(translation)
                pred_score.append(scores)
                sentence_id.append(s_id[0]), node_id.append(n_id[0])
                input_subwords.append(input_sw), output_subwords.append(
                    output_sw)
                tagged_tgt.append(tag_tgt), tagged_src.append(tag_src)
                i_tmx_phrases.append(i.get("tmx_phrases", []))

            out['response_body'] = [{
                "tgt": tgt[i],
                "pred_score": pred_score[i],
                "s_id": sentence_id[i],
                "input_subwords": input_subwords[i],
                "output_subwords": output_subwords[i],
                "n_id": node_id[i],
                "src": i_src[i],
                "tagged_tgt": tagged_tgt[i],
                "tagged_src": tagged_src[i],
                "save": i_save[i],
                "s0_src": i_s0_src[i],
                "s0_tgt": i_s0_tgt[i],
                "tmx_phrases": i_tmx_phrases[i]
            } for i in range(len(tgt))]
            out = CustomResponse(Status.SUCCESS.value, out['response_body'])
        except ServerModelError as e:
            status = Status.SEVER_MODEL_ERR.value
            status['why'] = str(e)
            log_exception(
                "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}"
                .format(e,
                        sys.exc_info()[0]), MODULE_CONTEXT, e)
            out = CustomResponse(status, inputs)
        except Exception as e:
            status = Status.SYSTEM_ERR.value
            status['why'] = str(e)
            log_exception(
                "Unexpected error:%s and %s" % (e, sys.exc_info()[0]),
                MODULE_CONTEXT, e)
            out = CustomResponse(status, inputs)

        return out
Beispiel #4
0
    def batch_translator(input_dict):
        """Translate a batch of sentences with the model given by ``input_dict['id']``.

        Args:
            input_dict: mapping that must provide ``'id'`` (the model id) and
                ``'src_list'`` (the source sentences, one string each).

        Returns:
            dict with three lists, each aligned index-by-index with
            ``src_list``: ``tagged_src_list``, ``tagged_tgt_list`` and
            ``tgt_list``.

        Raises:
            ServerModelError: re-raised after being logged.
            Exception: any other failure, including an unsupported model id,
                is logged and re-raised. Failures in ``get_models`` or
                ``misc.get_src_tgt_langauge`` happen before the ``try`` and
                therefore propagate without being logged here.
        """
        # Model ids grouped by the tokenizer/detokenizer pair wrapped around
        # the model call. Tuples (not sets) keep the plain ``==`` comparison
        # the former if/elif chain used. The trailing labels are the notes
        # that accompanied each id.
        indic_tok_moses_detok_ids = (
            5,   # hi-en exp-1
            6,   # hi-en_exp-2 05-05-20
            8,   # ta-en 1st
            48,  # kn-en 1st
            50,  # tel-en 1st
            52,  # guj-en 1st
            55,  # punjabi-en 1st
            58,  # bengali-en 2nd
            60,  # malay-en 1st
            61,  # ta-to-en 3rd
            62,  # mr-to-en 2nd
            66,  # bengali-en 3rd
        )
        moses_tok_indic_detok_ids = (
            32,  # 29/10/2019 Exp-12: old_data_original+lc_cleaned+ ik names
                 # translated from google(100k)+shabdkosh(appended 29k new),
                 # BPE-24K,50knmt,shuff,pretok
            44,  # eng-mr-3rd
            45,  # en-ta 4th
            47,  # en-kn 2nd
            49,  # en-tel 2nd
            51,  # en-guj 2nd
            53,  # en-punjabi 2nd
            56,  # 09/12/19-Exp-5.6: (also title-cases all-uppercase input)
            57,  # en-bengali 3rd
            59,  # en-malay 2nd
            65,  # en-bengali 4th
        )
        untokenized_ids = (
            7,   # english-tamil
            10,  # english-gujrati
            11,  # english-bengali
            15,  # english-kannada
            16,  # english-telgu
            17,  # english-malayalam
            18,  # english-punjabi
            42,  # english-marathi exp-2
        )

        model_id = input_dict['id']
        src_list = input_dict['src_list']
        num_sentence = len(src_list)

        # Per-sentence bookkeeping; every list is indexed like src_list.
        input_subwords_list = [None] * num_sentence
        output_subwords_list = [None] * num_sentence
        tagged_src_list = [None] * num_sentence
        tagged_tgt_list = [None] * num_sentence
        tgt_list = [None] * num_sentence
        score_list = [None] * num_sentence

        date_original_array = [None] * num_sentence
        url_original_array = [None] * num_sentence
        num_array_array = [None] * num_sentence
        num_map_array = [None] * num_sentence
        prefix_array = [None] * num_sentence

        sp_encoder, translator, sp_decoder = get_models(model_id)

        input_sentence_array_prepd = [None] * num_sentence
        special_case_sentence_indices = []

        src_language, tgt_language = misc.get_src_tgt_langauge(model_id)

        try:
            # ---- Pre-processing: digits, special cases, prefix, tagging ----
            for i, sent in enumerate(src_list):
                input_sentence = misc.convert_digits_preprocess(
                    src_language, sent.strip())

                if special_case_handler.special_case_fits(input_sentence):
                    # Output for these indices is produced after the model
                    # call by handle_special_cases, not taken from the model.
                    special_case_sentence_indices.append(i)
                    log_info(
                        "sentence fits in special case, capturing index to process at last",
                        MODULE_CONTEXT)
                else:
                    prefix_array[i], input_sentence = \
                        special_case_handler.prefix_handler(input_sentence)
                    (input_sentence, date_original_array[i],
                     url_original_array[i], num_array_array[i],
                     num_map_array[i]) = tagger_util.tag_number_date_url(
                         input_sentence)
                    tagged_src_list[i] = (prefix_array[i] + " " +
                                          input_sentence).lstrip()

                input_sentence_array_prepd[i] = input_sentence

            input_sentence_array_prepd, sent_indices_wo_stop = \
                special_case_handler.handle_sentences_wo_stop(
                    src_language, input_sentence_array_prepd)

            log_info("translating using NMT-model:{}".format(model_id),
                     MODULE_CONTEXT)

            # ---- Select the tokenizer pair for this model ----
            if model_id in indic_tok_moses_detok_ids:
                tokenize = sentence_processor.indic_tokenizer
                detokenize = sentence_processor.moses_detokenizer
            elif model_id in moses_tok_indic_detok_ids:
                tokenize = sentence_processor.moses_tokenizer
                detokenize = sentence_processor.indic_detokenizer
            elif model_id in untokenized_ids:
                tokenize = detokenize = None
            else:
                log_info(
                    "Unsupported model id: {} for given input".format(
                        model_id), MODULE_CONTEXT)
                raise Exception(
                    "Unsupported Model ID - id: {} for given input".format(
                        model_id))

            # ---- Tokenize, translate, detokenize ----
            if model_id == 56:
                # Model 56 alone title-cases all-uppercase sentences before
                # tokenization.
                input_sentence_array_prepd = [
                    sentence.title() if sentence.isupper() else sentence
                    for sentence in input_sentence_array_prepd
                ]
            if tokenize is not None:
                input_sentence_array_prepd = [
                    tokenize(sentence)
                    for sentence in input_sentence_array_prepd
                ]

            (translation_array, input_subwords_list, output_subwords_list,
             score_list) = encode_translate_decode(
                 input_sentence_array_prepd, sp_encoder, translator,
                 sp_decoder, input_subwords_list, output_subwords_list,
                 score_list)

            if detokenize is not None:
                translation_array = [
                    detokenize(translation)
                    for translation in translation_array
                ]

            translation_array = oc.postprocess_sentences_wo_stop(
                tgt_language, translation_array, sent_indices_wo_stop)

            # ---- Post-processing, one sentence at a time ----
            for i in range(num_sentence):
                if i in special_case_sentence_indices:
                    log_info(
                        "sentence fits in special case, returning output accordingly and not from model",
                        MODULE_CONTEXT)
                    stripped_src = src_list[i].strip()
                    tgt_list[i] = special_case_handler.handle_special_cases(
                        stripped_src, model_id)
                    score_list[i] = 1
                    input_subwords_list[i] = ""
                    output_subwords_list[i] = ""
                    tagged_tgt_list[i] = tgt_list[i]
                    tagged_src_list[i] = stripped_src
                else:
                    translated = (prefix_array[i] + " " +
                                  translation_array[i]).lstrip()
                    translated = translated.replace("▁", " ")
                    translated = misc.regex_pass(translated, [
                        patterns['p8'], patterns['p9'], patterns['p4'],
                        patterns['p5'], patterns['p6'], patterns['p7']
                    ])
                    # Snapshot taken before tags are swapped back for the
                    # original values.
                    tagged_tgt_list[i] = translated
                    translated = tagger_util.replace_tags_with_original(
                        translated, date_original_array[i],
                        url_original_array[i], num_array_array[i],
                        num_map_array[i])
                    translated = oc.cleaner(tagged_src_list[i], translated,
                                            model_id)
                    translation_array[i] = translated
                    tgt_list[i] = translated
                    log_info(
                        "translate_function-experiment-{} output: {}".format(
                            model_id, translated), MODULE_CONTEXT)

                tgt_list[i] = misc.convert_digits_postprocess(
                    tgt_language, tgt_list[i])

                # Fall back to the untouched source when the result is empty
                # or whitespace-only.
                if (not tgt_list[i]) or (tgt_list[i].isspace()):
                    tgt_list[i] = src_list[i]

            out = {
                "tagged_src_list": tagged_src_list,
                "tagged_tgt_list": tagged_tgt_list,
                "tgt_list": tgt_list
            }
        except ServerModelError as e:
            log_exception(
                "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}"
                .format(e,
                        sys.exc_info()[0]), MODULE_CONTEXT, e)
            raise
        except Exception as e:
            log_exception(
                "Exception caught in NMTTranslateService:batch_translator:%s and %s"
                % (e, sys.exc_info()[0]), MODULE_CONTEXT, e)
            raise

        return out