Example #1
    def process_txt_sentences(self, txt_sentences, strip_eos=True):
        ret_tgt = []
        for sentence_txt in txt_sentences:
            sentence = normalize_string(sentence_txt)
            if self.normalize_sal_entities:
                sentence, _, rlookup = normalize_sal_entities(sentence, "")
            else:
                rlookup = None
            if self.reorder_numbered_placeholders:
                sentence, _, rlookup2 = reorder_numbered_placeholders(
                    sentence, "")
                # Merge lookups so both entity and placeholder tokens can
                # be reinserted after generation
                if rlookup is None:
                    rlookup = rlookup2
                else:
                    rlookup.update(rlookup2)
            gen_tgt_toks, _ = self.seq2seq.generate(sentence)
            # Drop empty tokens introduced by doubled spaces in the output
            gen_tgt_toks = [t for t in gen_tgt_toks if len(t) > 0]
            if strip_eos and gen_tgt_toks and gen_tgt_toks[-1] == '<EOS>':
                gen_tgt_toks = gen_tgt_toks[:-1]
            if self.normalize_sal_entities or self.reorder_numbered_placeholders:
                gen_tgt_toks = reinsert_from_lookup(gen_tgt_toks, rlookup)
            ret_str = " ".join(gen_tgt_toks)
            if self.convert_to_json:
                if self.output_lang_name == 'sexp':
                    ret_str = convert_sexp2json(ret_str)
                elif self.output_lang_name == 'pn':
                    ret_str = convert_pn2json(ret_str)
                    # Alternative: convert_sexp2json(convert_pn2sexp(ret_str))
            ret_tgt.append(ret_str)
        return ret_tgt
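A minimal usage sketch follows. The `TxtProcessor` wrapper class, its constructor arguments, and the input sentences are assumptions for illustration; only `process_txt_sentences` itself appears above.

# Hypothetical driver; the class name and constructor wiring are assumed,
# not taken from the source above.
processor = TxtProcessor(seq2seq=trained_model,
                         output_lang_name='pn',
                         normalize_sal_entities=True,
                         reorder_numbered_placeholders=True,
                         convert_to_json=True)
for tgt in processor.process_txt_sentences(["turn on the kitchen lights"]):
    print(tgt)  # one generated target string per input sentence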
Example #2
 def inner_fn(l):
     # Normalize both halves of a tab-separated source/target pair
     normed_pairs = [normalize_string(s) for s in l.split('\t')]
     pair_str1, pair_str2 = normed_pairs[0], normed_pairs[1]
     if normalize_sal_entities:
         pair_str1, pair_str2, _ = normalize_sal_entities(
             pair_str1, pair_str2)
     if reorder_numplaceholders:
         pair_str1, pair_str2, _ = reorder_numbered_placeholders(
             pair_str1, pair_str2)
     return [pair_str1, pair_str2]
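This helper is evidently meant to be mapped over the lines of a tab-separated parallel corpus. A hedged sketch of that usage, assuming the enclosing scope defines the `normalize_sal_entities` and `reorder_numplaceholders` flags that `inner_fn` closes over (the file name is illustrative):

# Assumed usage: normalize each source/target pair in a TSV corpus.
with open('pairs.tsv', encoding='utf-8') as f:
    pairs = [inner_fn(line.rstrip('\n')) for line in f if '\t' in line]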
Example #3
    def generate(self,
                 sentence,
                 max_length=MAX_LENGTH,
                 include_attentions=False):
        sentence = normalize_string(sentence)
        with torch.no_grad():
            input_tensor = tensor_from_sentence(self.input_lang, sentence,
                                                self.device)
            input_length = input_tensor.size()[0]
            encoder_hidden = self.encoder.initHidden()

            encoder_outputs = torch.zeros(max_length,
                                          self.encoder.hidden_size,
                                          device=self.device)

            # Encode one token at a time, accumulating encoder outputs
            # (truncated to max_length so encoder_outputs cannot overflow)
            for ei in range(min(input_length, max_length)):
                encoder_output, encoder_hidden = self.encoder(
                    input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]],
                                         device=self.device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            if include_attentions:
                decoder_attentions = torch.zeros(max_length, max_length)
            else:
                decoder_attentions = None

            # Greedy decoding: feed each prediction back in as the next
            # input until <EOS> is produced or max_length is reached
            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                if include_attentions:
                    decoder_attentions[di] = decoder_attention.data
                _, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(
                        self.output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            if include_attentions:
                return decoded_words, decoder_attentions[:di + 1]
            else:
                return decoded_words, None
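A short sketch of calling `generate` on a trained model object. The variable names and example sentence are illustrative; `SOS_token`, `EOS_token`, and `MAX_LENGTH` are assumed to be module-level constants, as the code above implies.

# Illustrative call; 'model' is assumed to be a trained instance of the
# class defining generate() above.
decoded, attentions = model.generate("turn off the hallway light",
                                     include_attentions=True)
print(" ".join(decoded))
if attentions is not None:
    print(attentions.shape)  # (decoded steps, max_length)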
Example #4
 def process_sentences(self, sentence_dicts):
     accum_cst = []
     for sentence_dict in sentence_dicts:
         try:
             sid = sentence_dict['id']
             if 'sentence' in sentence_dict:
                 # Old format
                 raw_sentence = sentence_dict['sentence']
             else:
                 # New format
                 raw_sentence = sentence_dict['new-text']
             if self.verbose:
                 print("sid={}, raw={}".format(sid, raw_sentence))
             # For each sentence, identify the IDs first, normalize them, and then replace them
             sentence = normalize_string(raw_sentence)
             if self.normalize_sal_entities:
                 sentence, e2, rlookup = normalize_sal_entities(
                     sentence, "")
             else:
                 rlookup = {}
             if self.reorder_numbered_placeholders:
                 sentence, _, rlookup2 = reorder_numbered_placeholders(
                     sentence, "")
                 rlookup.update(rlookup2)
             if self.verbose:
                 print("Placeholder checks done")
             gen_cst_toks, _ = self.seq2seq.generate(sentence)
             if self.verbose:
                 print("... generated")
             # Hack to fix empty tokens introduced before '!=' (inequality) functions
             gen_cst_toks = [t for t in gen_cst_toks if len(t) > 0]
             if self.normalize_sal_entities or self.reorder_numbered_placeholders:
                 if self.verbose:
                     print("RLookup={}".format(rlookup))
                 gen_cst_toks = reinsert_from_lookup(gen_cst_toks, rlookup)
             if self.output_lang_name.lower() == "json" or self.convert_to_json:
                 gen_str_form = " ".join(gen_cst_toks).replace(
                     "<EOS>", "").replace("<SOS>", "")
             else:
                 # JSON-ify the tokens only when the target is not already JSON
                 gen_str_form = gen_toks2dict(gen_cst_toks)
             if self.convert_to_json:
                 if self.output_lang_name == 'sexp':
                     gen_str_form = convert_sexp2json(gen_str_form)
                 elif self.output_lang_name == 'pn':
                     gen_str_form = convert_pn2json(gen_str_form)
             gen_cst_dict = json.loads(gen_str_form)
             if self.verbose:
                 print("... remapping done")
             if self.include_normed_forms:
                 accum_cst.append({
                     "id": sid,
                     "nl": raw_sentence,
                     "cst": gen_cst_dict,
                     "normed_form": sentence
                 })
             else:
                 accum_cst.append({
                     "id": sid,
                     "nl": raw_sentence,
                     "cst": gen_cst_dict
                 })
         except json.JSONDecodeError:
             print("Invalid JSON! Value={}".format(gen_str_form))
             if self.include_normed_forms:
                 accum_cst.append({
                     "id": sid,
                     "error": "Invalid JSON",
                     "normed_form": sentence,
                     "nl": raw_sentence,
                     "cst_attempted": gen_str_form
                 })
             else:
                 accum_cst.append({
                     "id": sid,
                     "error": "Invalid JSON, value={}".format(gen_str_form)
                 })
     root_elt = {"sentences": accum_cst}
     return json.dumps(root_elt, indent=3)
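Finally, a hedged end-to-end sketch. Only the dict keys ('id', 'sentence', 'new-text') and the returned JSON shape come from the method above; the `processor` object and the inputs are assumptions.

# Hypothetical driver for process_sentences().
sentence_dicts = [
    {"id": "s1", "new-text": "turn on the kitchen lights"},
    {"id": "s2", "sentence": "set an alarm for 7 am"},  # old input format
]
print(processor.process_sentences(sentence_dicts))
# -> {"sentences": [{"id": "s1", "nl": "...", "cst": {...}}, ...]}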