Example #1
    def __init__(self, article, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOV words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
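
Every example on this page calls a get_dec_inp_targ_seqs helper whose body is not shown. The sketch below is an assumption inferred from how it is used (decoder input prefixed with START_DECODING, target suffixed with STOP_DECODING, both capped at max_len); it follows the usual pointer-generator convention rather than any specific repository here.

    def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
        # Decoder input is the sequence shifted right (starts with START);
        # the target is the sequence shifted left (ends with STOP).
        inp = [start_id] + sequence[:]
        target = sequence[:]
        if len(inp) > max_len:            # truncate
            inp = inp[:max_len]
            target = target[:max_len]     # a truncated target gets no STOP token
        else:
            target.append(stop_id)        # untruncated targets end with STOP
        assert len(inp) == len(target)
        return inp, target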
Example #2
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOV words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
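
The pointer-generator branch in these examples leans on data.article2ids, which is not reproduced on this page. A minimal sketch, assuming the usual convention: in-article OOVs are collected in order of first appearance and receive temporary ids starting at vocab.size(). UNKNOWN_TOKEN, vocab.size() and vocab.word2id are assumed names here, not taken from this page.

UNKNOWN_TOKEN = '[UNK]'  # assumed; the data module defines its own constant

def article2ids(article_words, vocab):
    # Map article words to ids; in-article OOVs get temporary ids
    # vocab.size() + oov_index instead of the UNK id, and are also
    # returned as a list in order of first appearance.
    ids, oovs = [], []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:                        # w is out of vocabulary
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab.size() + oovs.index(w))
        else:
            ids.append(i)
    return ids, oovs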
Example #3
    def __init__(self, article, abstract_sentences, vocab):
        """
        :param article: raw article string
        :param abstract_sentences: list of raw abstract sentences
        :param vocab: a vocabulary object
        """
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]

        self.enc_len = len(article_words)
        self.enc_input = [vocab.word2id(w) for w in article_words]

        # Process the abstract
        abstract = ' '.join(abstract_sentences)
        abstract_words = abstract.split()
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_tgt_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            self.enc_input_extend_vocab, self.article_oov = data.article2ids(
                article_words, vocab)
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oov)
            _, self.target = self.get_dec_inp_tgt_seqs(abs_ids_extend_vocab,
                                                       config.max_dec_steps,
                                                       start_decoding,
                                                       stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sentences = abstract_sentences
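
data.abstract2ids plays the matching role on the summary side; again a sketch under the same assumptions, not the repository's actual implementation. Summary OOVs that also occur in the article reuse the article's temporary id, while OOVs absent from the article stay mapped to UNK.

def abstract2ids(abstract_words, vocab, article_oovs):
    # Summary words that are OOV but present in the article reuse that
    # article's temporary id; OOVs not in the article remain UNK.
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in abstract_words:
        i = vocab.word2id(w)
        if i == unk_id and w in article_oovs:
            ids.append(vocab.size() + article_oovs.index(w))
        else:
            ids.append(i)
    return ids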
Example #4
    def __init__(self, article, abstract_sentences, vocab):
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        article_words = article.split()

        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(article_words)

        # list of word ids; OOVs are represented by the id for UNK token
        self.enc_input = [vocab.word2id(w) for w in article_words]

        # Process the abstract
        abstract = ' '.join(abstract_sentences)
        abstract_words = abstract.split()
        # list of word ids; OOVs are represented by the id for UNK token
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids,
                                                                 config.max_dec_steps,
                                                                 start_decoding,
                                                                 stop_decoding)
        self.dec_len = len(self.dec_input)

        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
            # also store the in-article OOV words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                                                        stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
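
A companion helper, outputids2words, is commonly used at decode time to invert this id scheme; it is sketched here under the same assumptions (ids at or above vocab.size() index into the per-article OOV list, vocab.id2word is an assumed method).

def outputids2words(id_list, vocab, article_oovs):
    # Convert output ids back to words, resolving temporary OOV ids through
    # the article_oovs list produced by article2ids.
    words = []
    for i in id_list:
        if i < vocab.size():
            words.append(vocab.id2word(i))                 # in-vocabulary id
        else:
            words.append(article_oovs[i - vocab.size()])   # temporary OOV id
    return words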
Example #5
  def __init__(self, article_sections, abstract_sentences, vocab, article_sents, similarity_scores):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    section_words = []
    self.enc_lens = []
    self.enc_inputs = []

    self.sent_lens = []

    # Process the article
    for i, section in enumerate(article_sections[:config.max_num_sections]):
      current_article_sents = [x.split() for x in article_sents[i]]
      words = section.split()
      if len(words) > config.max_section_size:
        words_to_remove = len(words) - config.max_section_size
        words = words[:config.max_section_size]

        indx = 0
        inner_stay = 0
        for j, sent in enumerate(reversed(current_article_sents)):
          indx += len(sent)
          if indx > words_to_remove:
            inner_stay = indx - words_to_remove
            break

        if j > 0:
          current_article_sents = current_article_sents[: -j]
        if inner_stay > 0:
          current_article_sents[-1] = current_article_sents[-1][: inner_stay]

      assert sum([len(x) for x in current_article_sents]) == len(words), "Bug in sent filtering!"

      self.enc_lens.append(len(words))
      self.enc_inputs.append([vocab.word2id(w) for w in words]) # list of word ids; OOVs are represented by the id for UNK token

      self.sent_lens.append([len(x) for x in current_article_sents])

      section_words.append(words)

    self.num_sections = len(section_words)
    self.max_enc_len = max(self.enc_lens)
    self.max_num_sents = max([len(x) for x in self.sent_lens])

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOV words themselves
      self.enc_inputs_extend_vocab, self.article_oovs = data.article2ids(section_words, vocab)

      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Process the similarity scores
    self.similarity_scores = []
    assert len(similarity_scores) == sum([len(x) for x in article_sents])
    global_sent_index = 0
    for i, section_sent_lens in enumerate(self.sent_lens):
      num_sents_cut = len(section_sent_lens)
      self.similarity_scores += similarity_scores[global_sent_index : global_sent_index + num_sents_cut]
      global_sent_index += len(article_sents[i])

    # Store the original strings
    self.original_article = article_sections
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
    self.original_article_sents = article_sents
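
The sentence-trimming loop in this example is easiest to verify on a toy input. The standalone sketch below uses hypothetical numbers (a 3-sentence section capped at 5 words) and replays the same bookkeeping, ending in the same assert.

# Hypothetical walkthrough of the truncation bookkeeping above.
max_section_size = 5
current_article_sents = [['a', 'b', 'c', 'd'], ['e', 'f', 'g'], ['h', 'i']]
words = [w for sent in current_article_sents for w in sent]    # 9 words total

if len(words) > max_section_size:
    words_to_remove = len(words) - max_section_size            # 4 words to drop
    words = words[:max_section_size]

    indx = 0
    inner_stay = 0
    for j, sent in enumerate(reversed(current_article_sents)):
        indx += len(sent)
        if indx > words_to_remove:     # the sentence that straddles the cut
            inner_stay = indx - words_to_remove
            break

    if j > 0:
        current_article_sents = current_article_sents[:-j]     # drop whole sentences
    if inner_stay > 0:
        current_article_sents[-1] = current_article_sents[-1][:inner_stay]

# 5 kept words, split as [['a', 'b', 'c', 'd'], ['e']]
assert sum(len(x) for x in current_article_sents) == len(words)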
Example #6
def get_a_batch(ami_data, idx, vocab, batch_size, max_enc_steps, max_dec_steps,
                start_id, stop_id, pad_id, sum_type, use_cuda):
    if sum_type not in ['long', 'short']:
        raise Exception("summary type long/short only")

    example_list = [None for _ in range(batch_size)]

    for bn in range(batch_size):
        topic_segments = ami_data[idx + bn][0]
        if sum_type == 'long':
            encoded_summary = ami_data[idx + bn][1]
        elif sum_type == 'short':
            encoded_summary = ami_data[idx + bn][2]
        # input
        meeting_words = []
        for segment in topic_segments:
            utterances = segment.utterances
            for utterance in utterances:
                encoded_words = utterance.encoded_words
                meeting_words += encoded_words

        meeting_word_string = bert_tokenizer.decode(meeting_words)
        # summary
        summary_string = bert_tokenizer.decode(encoded_summary)
        summary_string = summary_string.replace('[CLS]', '')
        summary_string = summary_string.replace('[MASK]', '')
        summary_string = summary_string.replace('[SEP]', '')

        # create an example
        article_words = word_tokenize(meeting_word_string)
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        enc_len = len(article_words)
        enc_input = [vocab.word2id(w) for w in article_words]

        abstract_words = word_tokenize(summary_string)
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        dec_input, target = get_dec_inp_targ_seqs(abs_ids,
                                                  config.max_dec_steps,
                                                  start_id, stop_id)
        dec_len = len(dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOV words themselves
            enc_input_extend_vocab, article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, target = get_dec_inp_targ_seqs(abs_ids_extend_vocab,
                                              config.max_dec_steps, start_id,
                                              stop_id)
        else:
            enc_input_extend_vocab = None
            article_oovs = None

        example = Example(enc_input, enc_len, dec_input, dec_len, target,
                          enc_input_extend_vocab, article_oovs)

        example_list[bn] = example

    ###################### init encoder seq ######################
    max_enc_seq_len = max([ex.enc_len for ex in example_list])
    for ex in example_list:
        ex.pad_encoder_input(max_enc_seq_len, pad_id)

    # Initialize the numpy arrays
    # Note: our enc_batch can have different length (second dimension) for each batch because we use dynamic_rnn for the encoder.
    enc_batch = np.zeros((batch_size, max_enc_seq_len), dtype=np.int32)
    enc_lens = np.zeros((batch_size), dtype=np.int32)
    enc_padding_mask = np.zeros((batch_size, max_enc_seq_len),
                                dtype=np.float32)
    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
        enc_batch[i, :] = ex.enc_input[:]
        enc_lens[i] = ex.enc_len
        for j in range(ex.enc_len):
            enc_padding_mask[i][j] = 1

    # For pointer-generator mode, need to store some extra info
    if config.pointer_gen:
        # Determine the max number of in-article OOVs in this batch
        max_art_oovs = max([len(ex.article_oovs) for ex in example_list])
        # Store the in-article OOVs themselves
        art_oovs = [ex.article_oovs for ex in example_list]
        # Store the version of the enc_batch that uses the article OOV ids
        enc_batch_extend_vocab = np.zeros((batch_size, max_enc_seq_len),
                                          dtype=np.int32)
        for i, ex in enumerate(example_list):
            enc_batch_extend_vocab[i, :] = ex.enc_input_extend_vocab[:]

    ###################### init decoder seq ######################
    # Pad the inputs and targets
    for ex in example_list:
        ex.pad_decoder_inp_targ(config.max_dec_steps, pad_id)

    # Initialize the numpy arrays.
    dec_batch = np.zeros((batch_size, config.max_dec_steps), dtype=np.int32)
    target_batch = np.zeros((batch_size, config.max_dec_steps), dtype=np.int32)
    dec_padding_mask = np.zeros((batch_size, config.max_dec_steps),
                                dtype=np.float32)
    dec_lens = np.zeros((batch_size), dtype=np.int32)

    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
        dec_batch[i, :] = ex.dec_input[:]
        target_batch[i, :] = ex.target[:]
        dec_lens[i] = ex.dec_len
        for j in range(ex.dec_len):
            dec_padding_mask[i][j] = 1

    # ------------------------------------------------------------------------------- #
    # ---------------- get_input_from_batch , get_output_from_batch ----------------- #
    # ------------------------------------------------------------------------------- #
    # get_input_from_batch
    enc_batch = Variable(torch.from_numpy(enc_batch).long())
    enc_padding_mask = Variable(torch.from_numpy(enc_padding_mask)).float()

    extra_zeros = None
    if config.pointer_gen:
        enc_batch_extend_vocab = Variable(
            torch.from_numpy(enc_batch_extend_vocab).long())
        # max_art_oovs is the max over all the article oov lists in the batch
        if max_art_oovs > 0:
            extra_zeros = Variable(torch.zeros((batch_size, max_art_oovs)))
    else:
        # Ensure these names exist even without pointer-generator mode,
        # since they are checked and returned below.
        enc_batch_extend_vocab = None

    c_t_1 = Variable(torch.zeros((batch_size, 2 * config.hidden_dim)))

    coverage = None
    if config.is_coverage:
        coverage = Variable(torch.zeros(enc_batch.size()))

    if use_cuda:
        enc_batch = enc_batch.cuda()
        enc_padding_mask = enc_padding_mask.cuda()

        if enc_batch_extend_vocab is not None:
            enc_batch_extend_vocab = enc_batch_extend_vocab.cuda()
        if extra_zeros is not None:
            extra_zeros = extra_zeros.cuda()
        c_t_1 = c_t_1.cuda()

        if coverage is not None:
            coverage = coverage.cuda()

    # get_output_from_batch
    dec_batch = Variable(torch.from_numpy(dec_batch).long())
    dec_padding_mask = Variable(torch.from_numpy(dec_padding_mask)).float()
    max_dec_len = np.max(dec_lens)
    dec_lens_var = Variable(torch.from_numpy(dec_lens)).float()
    target_batch = Variable(torch.from_numpy(target_batch)).long()

    if use_cuda:
        dec_batch = dec_batch.cuda()
        dec_padding_mask = dec_padding_mask.cuda()
        dec_lens_var = dec_lens_var.cuda()
        target_batch = target_batch.cuda()

    return (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
            extra_zeros, c_t_1, coverage), (dec_batch, dec_padding_mask,
                                            max_dec_len, dec_lens_var,
                                            target_batch)
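
get_a_batch builds Example objects from positional fields and later calls pad_encoder_input and pad_decoder_inp_targ on them, but that class is not shown on this page. The sketch below is only an assumption about the minimal container this function needs; attribute and method names are taken from the calls above, the bodies are guesses at the standard padding behaviour.

class Example:
    def __init__(self, enc_input, enc_len, dec_input, dec_len, target,
                 enc_input_extend_vocab, article_oovs):
        self.enc_input = enc_input
        self.enc_len = enc_len
        self.dec_input = dec_input
        self.dec_len = dec_len
        self.target = target
        self.enc_input_extend_vocab = enc_input_extend_vocab
        self.article_oovs = article_oovs

    def pad_encoder_input(self, max_len, pad_id):
        # Pad the encoder input (and its extended-vocab copy, if any) to max_len.
        while len(self.enc_input) < max_len:
            self.enc_input.append(pad_id)
        if self.enc_input_extend_vocab is not None:
            while len(self.enc_input_extend_vocab) < max_len:
                self.enc_input_extend_vocab.append(pad_id)

    def pad_decoder_inp_targ(self, max_len, pad_id):
        # Pad the decoder input and target sequences to max_len.
        while len(self.dec_input) < max_len:
            self.dec_input.append(pad_id)
        while len(self.target) < max_len:
            self.target.append(pad_id)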
Example #7
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article (this is the raw original text)
        article = article.decode('utf-8')
        article_words = article.split()
        # split the article into sentences
        specific_cnn = ['-lrb-', 'cnn', '-rrb-', '-lsb-', '-rsb-', '--']
        sentences = sent_tokenize(article)

        src_words = []
        for sen in sentences:
            word = word_tokenize(sen)
            #-lrb- cnn -rrb-
            word = [i for i in word if i not in specific_cnn]
            src_words.append(word)
        self.enc_len_origin = len(src_words)
        self.enc_nsents = len(sentences)

        # build the inputs for BERT

        self.enc_subtoken_ids, self.enc_segments_ids, self.enc_cls_ids, self.enc_sep_ids, self.enc_len_cls, self.enc_len_sep = bert_pro.preprocess(
            src_words)
        self.enc_len = len(
            self.enc_subtoken_ids
        )  # store the length after truncation but before padding
        # article_words = config.tokenizer.tokenize(article)
        # # print(article_words)
        # if len(article_words) > config.max_enc_steps:
        #   article_words = article_words[:config.max_enc_steps]
        # self.enc_len = len(article_words)
        # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        # print("A new abstract \n",abstract)
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token
        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOV words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences