def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to
    produce the encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each
            sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by
        # their temporary OOV id; also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
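# Every variant in this section calls get_dec_inp_targ_seqs (spelled
# get_dec_inp_tgt_seqs in one fork) without defining it. A minimal sketch,
# assuming the behavior of the reference pointer-generator implementation:
# the decoder input is the id sequence prefixed with the START id, the target
# is the same sequence suffixed with the STOP id, and both are truncated to
# max_len (the STOP token is dropped when truncation occurs). The AMI fork
# below uses the same logic as a module-level function without `self`.
def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
    inp = [start_id] + sequence[:]
    target = sequence[:]
    if len(inp) > max_len:  # truncate
        inp = inp[:max_len]
        target = target[:max_len]  # no STOP token when truncated
    else:
        target.append(stop_id)  # STOP token fits within max_len
    assert len(inp) == len(target)
    return inp, target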
def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > config.max_enc_steps:
        article_words = article_words[:config.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by
        # their temporary OOV id; also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
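# The batching code further down calls ex.pad_encoder_input and
# ex.pad_decoder_inp_targ on Example objects, but no variant here defines them.
# A sketch assuming the padding behavior of the reference implementation; the
# config.pointer_gen flag follows the config-based forks above.
def pad_encoder_input(self, max_len, pad_id):
    # Pad the encoder input (and its extended-vocab twin) with pad_id up to max_len.
    while len(self.enc_input) < max_len:
        self.enc_input.append(pad_id)
    if config.pointer_gen:
        while len(self.enc_input_extend_vocab) < max_len:
            self.enc_input_extend_vocab.append(pad_id)

def pad_decoder_inp_targ(self, max_len, pad_id):
    # Pad decoder input and target sequences with pad_id up to max_len.
    while len(self.dec_input) < max_len:
        self.dec_input.append(pad_id)
    while len(self.target) < max_len:
        self.target.append(pad_id)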
def __init__(self, article, abstract_sentences, vocab): """ :param article: raw article string :param abstract_sentences: list of raw abstract sentences :param vocab: a vocabulary object """ # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the article article_words = article.split() if len(article_words) > config.max_enc_steps: article_words = article_words[:config.max_enc_steps] self.enc_len = len(article_words) self.enc_input = [vocab.word2id(w) for w in article_words] # Process the abstract abstract = ' '.join(abstract_sentences) abstract_words = abstract.split() abs_ids = [vocab.word2id(w) for w in abstract_words] # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_tgt_seqs( abs_ids, config.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if config.pointer_gen: self.enc_input_extend_vocab, self.article_oov = data.article2ids( article_words, vocab) abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oov) _, self.target = self.get_dec_inp_tgt_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article = article self.original_abstract = abstract self.original_abstract_sentences = abstract_sentences
def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > config.max_enc_steps:
        article_words = article_words[:config.max_enc_steps]
    self.enc_len = len(article_words)
    # list of word ids; OOVs are represented by the id for UNK token
    self.enc_input = [vocab.word2id(w) for w in article_words]

    # Process the abstract
    abstract = ' '.join(abstract_sentences)
    abstract_words = abstract.split()
    # list of word ids; OOVs are represented by the id for UNK token
    abs_ids = [vocab.word2id(w) for w in abstract_words]

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    if config.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by
        # their temporary OOV id; also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
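# Every pointer-generator branch above relies on data.article2ids and
# data.abstract2ids. A sketch of both, assuming the behavior of the reference
# implementation and a flat token list (the section-based fork below passes a
# list of per-section word lists, so its data module presumably flattens
# first): in-article OOVs receive temporary ids vocab.size() + oov_index, and
# abstract OOVs map to those temporary ids when the word also occurs in the
# article (otherwise they remain UNK).
def article2ids(article_words, vocab):
    ids, oovs = [], []
    unk_id = vocab.word2id(data.UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is an in-article OOV
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab.size() + oovs.index(w))  # temporary OOV id
        else:
            ids.append(i)
    return ids, oovs

def abstract2ids(abstract_words, vocab, article_oovs):
    ids = []
    unk_id = vocab.word2id(data.UNKNOWN_TOKEN)
    for w in abstract_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is an OOV
            if w in article_oovs:  # in-article OOV: reuse its temporary id
                ids.append(vocab.size() + article_oovs.index(w))
            else:  # out-of-article OOV: leave it as UNK
                ids.append(unk_id)
        else:
            ids.append(i)
    return ids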
def __init__(self, article_sections, abstract_sentences, vocab, article_sents, similarity_scores):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    section_words = []
    self.enc_lens = []
    self.enc_inputs = []
    self.sent_lens = []

    # Process the article, section by section
    for i, section in enumerate(article_sections[:config.max_num_sections]):
        current_article_sents = [x.split() for x in article_sents[i]]
        words = section.split()
        if len(words) > config.max_section_size:
            # Truncate the section to max_section_size words, then trim the
            # per-sentence token lists so they stay consistent with the cut:
            # walk the sentences from the end, drop those that fall entirely
            # past the cut, and shorten the sentence that straddles it.
            words_to_remove = len(words) - config.max_section_size
            words = words[:config.max_section_size]
            indx = 0
            inner_stay = 0
            for j, sent in enumerate(reversed(current_article_sents)):
                indx += len(sent)
                if indx > words_to_remove:
                    inner_stay = indx - words_to_remove  # surviving words of the straddling sentence
                    break
            if j > 0:
                current_article_sents = current_article_sents[:-j]
            if inner_stay > 0:
                current_article_sents[-1] = current_article_sents[-1][:inner_stay]
            assert sum([len(x) for x in current_article_sents]) == len(words), "Bug in sent filtering!"
        self.enc_lens.append(len(words))
        self.enc_inputs.append([vocab.word2id(w) for w in words])  # list of word ids; OOVs are represented by the id for UNK token
        self.sent_lens.append([len(x) for x in current_article_sents])
        section_words.append(words)

    self.num_sections = len(section_words)
    self.max_enc_len = max(self.enc_lens)
    self.max_num_sents = max([len(x) for x in self.sent_lens])

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
        # Store a version of the enc_inputs where in-article OOVs are represented by
        # their temporary OOV id; also store the in-article OOV words themselves
        self.enc_inputs_extend_vocab, self.article_oovs = data.article2ids(section_words, vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Process the similarity scores: keep only the scores of the sentences that
    # survived truncation, section by section
    self.similarity_scores = []
    assert len(similarity_scores) == sum([len(x) for x in article_sents])
    global_sent_index = 0
    for i, section_sent_lens in enumerate(self.sent_lens):
        num_sents_cut = len(section_sent_lens)
        self.similarity_scores += similarity_scores[global_sent_index:global_sent_index + num_sents_cut]
        global_sent_index += len(article_sents[i])

    # Store the original strings
    self.original_article = article_sections
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
    self.original_article_sents = article_sents
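# The reversed-sentence bookkeeping above is easy to misread, so here is the
# same truncation logic as a standalone function with a toy example. The name
# truncate_sents and the numbers are hypothetical, not from any config.
def truncate_sents(sents, max_words):
    # sents: list of token lists; keep only the first max_words tokens overall.
    total = sum(len(s) for s in sents)
    if total <= max_words:
        return sents
    words_to_remove = total - max_words
    indx = 0
    inner_stay = 0
    for j, sent in enumerate(reversed(sents)):
        indx += len(sent)
        if indx > words_to_remove:
            inner_stay = indx - words_to_remove  # surviving words of the straddling sentence
            break
    if j > 0:
        sents = sents[:-j]  # drop sentences that fall entirely past the cut
    if inner_stay > 0:
        sents[-1] = sents[-1][:inner_stay]
    return sents

# E.g. with sentence lengths [4, 3, 5] and max_words = 8, four words are cut:
# the last sentence keeps its first token, giving lengths [4, 3, 1]. With
# max_words = 6, the last sentence is dropped entirely and the second one is
# shortened, giving lengths [4, 2].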
import numpy as np
import torch
from torch.autograd import Variable
from nltk.tokenize import word_tokenize

# `config`, `data`, `bert_tokenizer`, `Example` and `get_dec_inp_targ_seqs`
# are provided by the surrounding project.


def get_a_batch(ami_data, idx, vocab, batch_size, max_enc_steps, max_dec_steps,
                start_id, stop_id, pad_id, sum_type, use_cuda):
    if sum_type not in ['long', 'short']:
        raise Exception("summary type long/short only")

    example_list = [None for _ in range(batch_size)]
    for bn in range(batch_size):
        topic_segments = ami_data[idx + bn][0]
        if sum_type == 'long':
            encoded_summary = ami_data[idx + bn][1]
        elif sum_type == 'short':
            encoded_summary = ami_data[idx + bn][2]

        # input: concatenate the encoded words of every utterance in every topic segment
        meeting_words = []
        for segment in topic_segments:
            for utterance in segment.utterances:
                meeting_words += utterance.encoded_words
        meeting_word_string = bert_tokenizer.decode(meeting_words)

        # summary: decode and strip BERT special tokens
        summary_string = bert_tokenizer.decode(encoded_summary)
        summary_string = summary_string.replace('[CLS]', '')
        summary_string = summary_string.replace('[MASK]', '')
        summary_string = summary_string.replace('[SEP]', '')

        # create an example
        article_words = word_tokenize(meeting_word_string)
        if len(article_words) > max_enc_steps:
            article_words = article_words[:max_enc_steps]
        enc_len = len(article_words)
        enc_input = [vocab.word2id(w) for w in article_words]

        abstract_words = word_tokenize(summary_string)
        abs_ids = [vocab.word2id(w) for w in abstract_words]
        dec_input, target = get_dec_inp_targ_seqs(abs_ids, max_dec_steps, start_id, stop_id)
        dec_len = len(dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by
            # their temporary OOV id; also store the in-article OOV words themselves
            enc_input_extend_vocab, article_oovs = data.article2ids(article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are
            # represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, target = get_dec_inp_targ_seqs(abs_ids_extend_vocab, max_dec_steps, start_id, stop_id)
        else:
            enc_input_extend_vocab = None
            article_oovs = None

        example_list[bn] = Example(enc_input, enc_len, dec_input, dec_len, target,
                                   enc_input_extend_vocab, article_oovs)

    ###################### init encoder seq ######################
    max_enc_seq_len = max([ex.enc_len for ex in example_list])
    for ex in example_list:
        ex.pad_encoder_input(max_enc_seq_len, pad_id)

    # Initialize the numpy arrays
    # Note: our enc_batch can have different length (second dimension) for each
    # batch because we use dynamic_rnn for the encoder.
    enc_batch = np.zeros((batch_size, max_enc_seq_len), dtype=np.int32)
    enc_lens = np.zeros((batch_size), dtype=np.int32)
    enc_padding_mask = np.zeros((batch_size, max_enc_seq_len), dtype=np.float32)

    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
        enc_batch[i, :] = ex.enc_input[:]
        enc_lens[i] = ex.enc_len
        for j in range(ex.enc_len):
            enc_padding_mask[i][j] = 1

    # Default to None so the cuda transfers and the return statement below are
    # well-defined when pointer-generator mode is off
    enc_batch_extend_vocab = None
    extra_zeros = None
    max_art_oovs = 0

    # For pointer-generator mode, need to store some extra info
    if config.pointer_gen:
        # Determine the max number of in-article OOVs in this batch
        max_art_oovs = max([len(ex.article_oovs) for ex in example_list])
        # Store the in-article OOVs themselves
        art_oovs = [ex.article_oovs for ex in example_list]
        # Store the version of the enc_batch that uses the article OOV ids
        enc_batch_extend_vocab = np.zeros((batch_size, max_enc_seq_len), dtype=np.int32)
        for i, ex in enumerate(example_list):
            enc_batch_extend_vocab[i, :] = ex.enc_input_extend_vocab[:]

    ###################### init decoder seq ######################
    # Pad the inputs and targets
    for ex in example_list:
        ex.pad_decoder_inp_targ(max_dec_steps, pad_id)

    # Initialize the numpy arrays.
    dec_batch = np.zeros((batch_size, max_dec_steps), dtype=np.int32)
    target_batch = np.zeros((batch_size, max_dec_steps), dtype=np.int32)
    dec_padding_mask = np.zeros((batch_size, max_dec_steps), dtype=np.float32)
    dec_lens = np.zeros((batch_size), dtype=np.int32)

    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
        dec_batch[i, :] = ex.dec_input[:]
        target_batch[i, :] = ex.target[:]
        dec_lens[i] = ex.dec_len
        for j in range(ex.dec_len):
            dec_padding_mask[i][j] = 1

    # ------------------------------------------------------------------- #
    # ----------- get_input_from_batch, get_output_from_batch ------------ #
    # ------------------------------------------------------------------- #

    # get_input_from_batch
    enc_batch = Variable(torch.from_numpy(enc_batch).long())
    enc_padding_mask = Variable(torch.from_numpy(enc_padding_mask)).float()

    if config.pointer_gen:
        enc_batch_extend_vocab = Variable(torch.from_numpy(enc_batch_extend_vocab).long())
        # max_art_oovs is the max over all the article oov lists in the batch
        if max_art_oovs > 0:
            extra_zeros = Variable(torch.zeros((batch_size, max_art_oovs)))

    c_t_1 = Variable(torch.zeros((batch_size, 2 * config.hidden_dim)))

    coverage = None
    if config.is_coverage:
        coverage = Variable(torch.zeros(enc_batch.size()))

    if use_cuda:
        enc_batch = enc_batch.cuda()
        enc_padding_mask = enc_padding_mask.cuda()
        if enc_batch_extend_vocab is not None:
            enc_batch_extend_vocab = enc_batch_extend_vocab.cuda()
        if extra_zeros is not None:
            extra_zeros = extra_zeros.cuda()
        c_t_1 = c_t_1.cuda()
        if coverage is not None:
            coverage = coverage.cuda()

    # get_output_from_batch
    dec_batch = Variable(torch.from_numpy(dec_batch).long())
    dec_padding_mask = Variable(torch.from_numpy(dec_padding_mask)).float()
    max_dec_len = np.max(dec_lens)
    dec_lens_var = Variable(torch.from_numpy(dec_lens)).float()
    target_batch = Variable(torch.from_numpy(target_batch)).long()

    if use_cuda:
        dec_batch = dec_batch.cuda()
        dec_padding_mask = dec_padding_mask.cuda()
        dec_lens_var = dec_lens_var.cuda()
        target_batch = target_batch.cuda()

    return (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
            extra_zeros, c_t_1, coverage), \
           (dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch)
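# A hypothetical invocation of get_a_batch. `ami_data` and `vocab` must come
# from the project's own loading code; the batch size shown is illustrative,
# and data.PAD_TOKEN is assumed to be the pad constant of the reference data
# module.
start_id = vocab.word2id(data.START_DECODING)
stop_id = vocab.word2id(data.STOP_DECODING)
pad_id = vocab.word2id(data.PAD_TOKEN)

enc_side, dec_side = get_a_batch(
    ami_data, idx=0, vocab=vocab, batch_size=4,
    max_enc_steps=config.max_enc_steps, max_dec_steps=config.max_dec_steps,
    start_id=start_id, stop_id=stop_id, pad_id=pad_id,
    sum_type='long', use_cuda=torch.cuda.is_available())

(enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
 extra_zeros, c_t_1, coverage) = enc_side
(dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch) = dec_side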
def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article (the raw source document)
    if isinstance(article, bytes):
        article = article.decode('utf-8')
    article_words = article.split()

    # Split the article into sentences, dropping CNN-specific markup tokens
    # such as '-lrb- cnn -rrb-'
    specific_cnn = ['-lrb-', 'cnn', '-rrb-', '-lsb-', '-rsb-', '--']
    sentences = sent_tokenize(article)
    src_words = []
    for sen in sentences:
        word = word_tokenize(sen)
        word = [i for i in word if i not in specific_cnn]
        src_words.append(word)

    self.enc_len_origin = len(src_words)
    self.enc_nsents = len(sentences)

    # Build the BERT encoder inputs
    self.enc_subtoken_ids, self.enc_segments_ids, self.enc_cls_ids, self.enc_sep_ids, \
        self.enc_len_cls, self.enc_len_sep = bert_pro.preprocess(src_words)
    self.enc_len = len(self.enc_subtoken_ids)  # store the length after truncation but before padding

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info.
    # Note: this branch operates on the whitespace tokens (article_words), not
    # on the BERT subtokens used for the encoder inputs above.
    if config.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by
        # their temporary OOV id; also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences