Example no. 1
    def __init__(self, article, abstract_sentences, vocab, hps):
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_len = len(article_words)
        self.enc_input = [vocab.word2id(w) for w in article_words]

        abstract = ' '.join(abstract_sentences)
        abstract_words = abstract.split()
        abs_ids = [vocab.word2id(w) for w in abstract_words]
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
            article_words, vocab)
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                 self.article_oovs)

        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab,
                                                    hps.max_dec_steps,
                                                    start_decoding,
                                                    stop_decoding)

        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
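
A note on the shared helper: Example no. 1 (and nearly every variant that follows) builds its decoder sequences through a get_dec_inp_targ_seqs method that is not reproduced in these snippets. As a minimal sketch, assuming the usual pointer-generator convention (decoder input is START plus the ids, target is the ids plus STOP, both truncated to max_len, with the stop token dropped whenever truncation occurs), it could look like the following; the exact truncation rule may differ per repository:

    # Sketch only; not taken from any of the examples above.
    def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
        inp = [start_id] + sequence[:]
        target = sequence[:]
        if len(inp) > max_len:            # truncate both sequences
            inp = inp[:max_len]
            target = target[:max_len]     # no stop token when truncated
        else:
            target.append(stop_id)        # stop token only if it fits
        assert len(inp) == len(target)
        return inp, target
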
Example no. 2
    def __init__(self, article, abstract_sentences, vocab, concept_vocab):
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        article = ' '.join(article)
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(article_words)
        self.enc_input = [vocab.word2id(w) for w in article_words]

        abstract = ' '.join(abstract_sentences)
        abstract_words = abstract.split()
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        if config.pointer_gen:
            (self.enc_input_extend_vocab, self.article_oovs,
             self.enc_input_concept_extend_vocab, self.concept_p,
             self.position, self.concept_mask) = data.article2ids(
                 article_words, vocab, concept_vocab)
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
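
The pointer-generator branch in these examples leans on two helpers from the data module, article2ids and abstract2ids, which are never shown. The sketch below illustrates the common two-return-value form (the variant in Example no. 2 additionally returns concept-related outputs, which are not covered here); names such as vocab.size() and the UNK constant are assumptions about the surrounding code:

    # Sketch only; the real helpers live in the repository's data module.
    UNKNOWN_TOKEN = '[UNK]'  # assumed name of the UNK constant

    def article2ids(article_words, vocab):
        # In-article OOVs get temporary ids vocab.size() + position in the OOV list.
        ids, oovs = [], []
        unk_id = vocab.word2id(UNKNOWN_TOKEN)
        for w in article_words:
            i = vocab.word2id(w)
            if i == unk_id:
                if w not in oovs:
                    oovs.append(w)
                ids.append(vocab.size() + oovs.index(w))
            else:
                ids.append(i)
        return ids, oovs

    def abstract2ids(abstract_words, vocab, article_oovs):
        # OOVs that also appear in the article reuse its temporary id; others stay UNK.
        ids = []
        unk_id = vocab.word2id(UNKNOWN_TOKEN)
        for w in abstract_words:
            i = vocab.word2id(w)
            if i == unk_id and w in article_oovs:
                ids.append(vocab.size() + article_oovs.index(w))
            else:
                ids.append(i)
        return ids
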
Example no. 3
    def __init__(self, article, ner_path, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # Process importance
        self.word_id_to_imp = data.get_word_id_to_importance(ner_path, vocab)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
Example no. 4
    def __init__(self, article, abstract_sentences, pos, vocab_in, vocab_out, config, types=None):
        self._config = config

        # Get ids of special tokens
        start_decoding = vocab_in.word2id(data.START_DECODING)
        stop_decoding = vocab_in.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(article_words)  # store the length after truncation but before padding
        self.enc_input = [vocab_in.word2id(w) for w in
                          article_words]  # list of word ids; OOVs are represented by the id for UNK token


        if config.use_pos_tag:
            if pos is None:
                self.decode_pos, self.target_pos = [], []
                self.enc_pos = []
            else:
                pos_words = pos.split()
                if len(pos_words) > config.max_enc_steps:
                    pos_words = pos_words[:config.max_enc_steps]
                assert len(pos_words) == len(article_words)
                # self.enc_pos = [vocab_in.tag2id[w] for w in pos_words]
                self.enc_pos = [vocab_out.vocab_tag.word2id(w) for w in pos_words]
                self.decode_pos, self.target_pos = self.get_dec_inp_targ_seqs(
                    self.enc_pos, config.max_dec_steps, start_decoding, stop_decoding)


        if config.types:
            self.types = types

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [vocab_out.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token



        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding,stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab_out)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab_out, self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding,stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
Example no. 5
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        # Truncate if longer than the configured max length
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        # Encode the article; OOV words are mapped to UNK here
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        # Encode the abstract
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        # Build the input and target sequences for the decoding stage
        self.dec_input, _ = self.get_dec_inp_targ_seqs(abs_ids,
                                                       config.max_dec_steps,
                                                       start_decoding,
                                                       stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
        # The encoder input needs the source ids extended with temporary ids for the OOV words
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
            article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        # Get the reference-summary ids, where OOV words are represented by their in-article OOV ids
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                 self.article_oovs)

        # Get decoder target sequence
        # Build the target ids, handling OOVs
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab,
                                                    config.max_dec_steps,
                                                    start_decoding,
                                                    stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
Example no. 6
  def __init__(self, title, article, tags, abstract_sentences, abstract_sentences_all, vocab, hps, stop_words):
    """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
      article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words) # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    if hps.title_engaged or hps.title_guided:
      title_words = title.split()
      self.title_input = [vocab.word2id(w) for w in title_words[:hps.max_title_len]]
      self.title_len = len(self.title_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
      if (hps.co_occurrence or hps.prev_relation or hps.co_occurrence_h or hps.co_occurrence_i
          or (hps.coverage and hps.coverage_weighted) or hps.attention_weighted
          or hps.markov_attention or hps.markov_attention_contribution):
        self.cooccurrence_matrix, self.cooccurrence_weight = data.get_cooccurrence_matrix(
            self.enc_input_extend_vocab,
            win_size=hps.occurrence_window_size,
            exclude_words=stop_words,
            need_weight=(hps.co_occurrence_i or (hps.coverage and hps.coverage_weighted)
                         or hps.attention_weighted or hps.markov_attention
                         or hps.markov_attention_contribution),
            top_ten_kept=hps.top_ten_kept)
      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    if tags is not None:
      self.tags = tags[:self.enc_len]

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
    self.original_abstract_sents_all = abstract_sentences_all
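
Example no. 6 is the only variant that also builds a word co-occurrence matrix over the extended-vocabulary encoder input. data.get_cooccurrence_matrix is not shown in the snippet; the sketch below is one plausible sliding-window formulation (symmetric counts within a window, with an exclusion set standing in for the stop words). The real implementation, including its need_weight and top_ten_kept behaviour, may well differ:

    # Sketch only; an assumed shape for a co-occurrence helper.
    from collections import defaultdict

    def get_cooccurrence_counts(token_ids, win_size=5, exclude_ids=frozenset()):
        # Count how often two token ids fall within win_size positions of each other.
        counts = defaultdict(int)
        for i, a in enumerate(token_ids):
            if a in exclude_ids:
                continue
            for j in range(i + 1, min(i + win_size, len(token_ids))):
                b = token_ids[j]
                if b in exclude_ids or b == a:
                    continue
                counts[(a, b)] += 1
                counts[(b, a)] += 1
        return dict(counts)
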
Example no. 7
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article = article.decode('utf-8')
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        twk = textrank.TextRankKeyword()
        twk.analyze(article_words, window_size=4, lower=False)
        self.word_rank = twk.makeword_rank()

        self.word_rank_data = []
        for w in article_words:
            self.word_rank_data.append(self.word_rank[w])

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
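
Example no. 7 attaches a TextRank-style salience score to every encoder token through textrank.TextRankKeyword, whose implementation is not included here. As a rough sketch of the underlying idea (PageRank over a word co-occurrence graph), comparable ranks could be computed with networkx; the actual TextRankKeyword API and scoring almost certainly differ:

    # Sketch only; not the repository's textrank module.
    import networkx as nx

    def textrank_word_scores(words, window_size=4):
        # Connect words that co-occur within window_size, then run PageRank.
        graph = nx.Graph()
        graph.add_nodes_from(set(words))
        for i, w in enumerate(words):
            for j in range(i + 1, min(i + window_size, len(words))):
                if words[j] != w:
                    graph.add_edge(w, words[j])
        return nx.pagerank(graph)  # dict: word -> score

    # word_rank = textrank_word_scores(article_words, window_size=4)
    # word_rank_data = [word_rank[w] for w in article_words]
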
Example no. 8
  def __init__(self, content_text, field_text, summary_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      content_text: source text; a string. Each token is separated by a single space.
      field_text: field tokens aligned with content_text; a string. Each token is separated by a single space.
      summary_sentences: list of strings, one per summary sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING,field=False)
    stop_decoding = vocab.word2id(data.STOP_DECODING,field=False)
    
    # Process the content_text
    content_words = content_text.split()
    if len(content_words) > hps.max_enc_steps:
      content_words = content_words[:hps.max_enc_steps]
    self.enc_len_content = len(content_words) # store the length after truncation but before padding
    self.enc_input_content = [vocab.word2id(w,field=False) for w in content_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the field_text
    field_words = field_text.split()
    if len(field_words) > hps.max_enc_steps:
      field_words = field_words[:hps.max_enc_steps]
    self.enc_len_field = len(field_words) # store the length after truncation but before padding
    self.enc_input_field = [vocab.word2id(w,field=True) for w in field_words] # list of word ids; OOVs are represented by the id for UNK token

    assert self.enc_len_field == self.enc_len_content
    # Process the summary
    summary_text = ' '.join(summary_sentences) # string
    summary_words = summary_text.split() # list of strings
    summary_ids = [vocab.word2id(w,field = False) for w in summary_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(summary_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)
    
    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = data.article2ids(content_words, vocab)
      
      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      sum_ids_extend_vocab = data.abstract2ids(summary_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(sum_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)
    
    # Store the original strings
    self.original_content = content_text
    self.original_field = field_text
    self.original_summary = summary_text
    self.original_summary_sents = summary_sentences
Example no. 9
    def __init__(self, article, abstract_sens, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences.

        Args:
            article: source text; a string. each token is separated by a single space.
            abstract_sens: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
            vocab: Vocabulary object
            hps: hyperparameters
        """

        self.__hps = hps

        start_id = vocab.w2i(data.START_TOKEN)
        stop_id = vocab.w2i(data.STOP_TOKEN)

        # article
        article_words = article.split()

        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]

        self.enc_len = len(article_words)
        self.enc_input = [vocab.w2i(w) for w in article_words]

        # abstract
        abstract = ' '.join(abstract_sens)
        abstract_words = abstract.split()

        if len(abstract_words) > hps.max_dec_steps:
            abstract_words = abstract_words[:hps.max_dec_steps]

        # pointer generator

        ## Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id and the in-article OOVs words
        self.enc_input_ext_vocab, self.article_oovs = data.article2ids(
            article_words, vocab)

        ## Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids = [vocab.w2i(w) for w in abstract_words]

        self.dec_input, self.dec_target = self.__get_dec_input_target_seqs(
            abs_ids, hps.max_dec_steps, start_id, stop_id)
        self.dec_len = len(self.dec_input)

        abs_ids_ext_vocab = data.abstract2ids(abstract_words, vocab,
                                              self.article_oovs)
        _, self.ext_dec_target = self.__get_dec_input_target_seqs(
            abs_ids_ext_vocab, hps.max_dec_steps, start_id, stop_id)

        # origin backup
        self.origin_article = article
        self.origin_abstract = abstract
        self.origin_abstract_sens = abstract_sens
Example no. 10
    def __init__(self, paragraph, question, answer, answer_positions, 
                 vocab, max_enc_steps, max_dec_steps, dynamic_vocab=False):
        
        self.dynamic_vocab = dynamic_vocab
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        paragraph_words = word_tokenize(paragraph)
        question_words = word_tokenize(question)
        answer_start_idx, answer_end_idx = answer_positions
        #assert ' '.join(paragraph_words[answer_start_idx:answer_end_idx]) == answer
        
        # Process the paragraph
        if len(paragraph_words) > max_enc_steps:
            if answer_end_idx <= max_enc_steps:
                paragraph_words = paragraph_words[:max_enc_steps]
            else:
                answer_mid_idx = (answer_start_idx + answer_end_idx) // 2
                # assume len(answer_words) <= len(paragraph_words)
                paragraph_trunc_end = min(answer_mid_idx + max_enc_steps//2, len(paragraph_words))
                paragraph_trunc_start = paragraph_trunc_end - max_enc_steps + 1
                assert (paragraph_trunc_start <= answer_start_idx) and (paragraph_trunc_end >= answer_end_idx) 
                paragraph_words = paragraph_words[paragraph_trunc_start:paragraph_trunc_end]
                answer_start_idx -= paragraph_trunc_start
                answer_end_idx -= paragraph_trunc_start
        self.enc_len = len(paragraph_words) # store the length after truncation but before padding
        self.enc_input = [vocab.word2id(w) for w in paragraph_words] # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        question_ids = [vocab.word2id(w) for w in question_words] # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(question_ids, max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if self.dynamic_vocab:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.enc_oovs = data.article2ids(paragraph_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            question_ids_extend_vocab = data.abstract2ids(question_words, vocab, self.enc_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            self.dec_input_extend_vocab, self.target = self.get_dec_inp_targ_seqs(question_ids_extend_vocab, max_dec_steps, start_decoding, stop_decoding)

        # Store the original strings
        self.original_paragraph = paragraph
        self.original_question = question
        self.original_answer = answer #' '.join(paragraph_words[answer_start_idx:answer_end_idx])
        self.answer_start_idx = answer_start_idx
        self.answer_end_idx = answer_end_idx
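
Example no. 10 is the only snippet that truncates around a span instead of simply cutting from the front: when the answer would fall outside the first max_enc_steps tokens, the window is re-centred on the answer midpoint and the answer indices are shifted into the new window. A small trace of that arithmetic with made-up numbers:

    # Hypothetical values, just to trace the truncation arithmetic above.
    max_enc_steps = 10
    paragraph_len = 30
    answer_start_idx, answer_end_idx = 20, 24  # answer lies past the first 10 tokens

    answer_mid_idx = (answer_start_idx + answer_end_idx) // 2                      # 22
    paragraph_trunc_end = min(answer_mid_idx + max_enc_steps // 2, paragraph_len)  # 27
    paragraph_trunc_start = paragraph_trunc_end - max_enc_steps + 1                # 18
    assert paragraph_trunc_start <= answer_start_idx <= answer_end_idx <= paragraph_trunc_end
    answer_start_idx -= paragraph_trunc_start  # 2
    answer_end_idx -= paragraph_trunc_start    # 6
    # The slice [18:27] keeps max_enc_steps - 1 tokens, i.e. the "+ 1" leaves the
    # window one token short of the full budget.
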
Example no. 11
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)  # START_DECODING = '[START]'
        stop_decoding = vocab.word2id(data.STOP_DECODING)  # STOP_DECODING = '[STOP]'

        # Process the article
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:  # max_enc_steps=400
            article_words = article_words[:config.max_enc_steps]

        # store the length after truncation but before padding
        self.enc_len = len(article_words)

        # list of word ids; OOVs are represented by the id for UNK token
        self.enc_input = [vocab.word2id(w) for w in article_words]  # word --> id

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [vocab.word2id(w) for w in
                   abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding,
                                                                 stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:  # pointer_gen=True
            """
            Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; 
            also store the in-article OOVs words themselves
            """
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

            """
            Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            """
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                                                        stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
Example no. 12
    def __init__(self, content, query, summary, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        content_words = content.split()
        query_words = query.split()
        summary_words = summary.split()
        if len(content_words) > config.max_enc_steps:
            content_words = content_words[:config.max_enc_steps]
        self.enc_len = len(
            content_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in content_words
        ]  # list of word ids; OOVs are represented by the id for UNK token
        self.query_enc_input = [vocab.word2id(w) for w in query_words]
        # Process the abstract

        summary_ids = [
            vocab.word2id(w) for w in summary_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            summary_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.content_oovs = data.article2ids(
                content_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(summary_words, vocab,
                                                     self.content_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_content = content
        self.original_query = query
        self.original_summary = summary
Example no. 13
    def __init__(self, article, abstract_sentence, vocab, config):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.BOS_WORD)
        stop_decoding = vocab.word2id(data.EOS_WORD)

        # Process the article
        # if article == 'nan':
        #     article_words = ['']
        # else:
        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract_words = abstract_sentence.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_seqs(abs_ids_extend_vocab,
                                                   config.max_dec_steps,
                                                   start_decoding,
                                                   stop_decoding)

        # Store the original strings
        # self.original_article = article
        self.original_abstract = abstract_sentence
Example no. 14
    def __init__(self, article, abstract_sentences, vocab):
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article; truncate if it exceeds the configured max length.
        article_words = list(article)  # character-level split for Chinese text
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        # Encode the article; OOV words are mapped to UNK here
        self.enc_input = [vocab.word2id(w) for w in article_words]
        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = list(abstract)  # list of strings
        # Encode the abstract
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        # Build the decoder input and target sequences ("START w1 w2" / "w1 w2 STOP"); they must have the same length
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if config.pointer_gen:
            # The encoder needs the source ids extended with temporary ids for the in-article OOV words
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get the reference-summary ids, where OOV words are represented by their in-article OOV ids
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Build the target ids, handling OOVs
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        # The abstract before encoding, as a list of sentences
        self.original_abstract_sents = abstract_sentences
Example no. 15
  def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
      article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words) # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
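
All of the variants that record self.enc_len and self.dec_len "before padding" defer the padding itself to batch assembly, typically through methods such as pad_encoder_input and pad_decoder_inp_targ in the original pointer-generator code (only Example no. 16 below pads inline inside __init__). A minimal sketch of those two methods, assuming that layout:

    # Sketch only; padding normally happens when the Batch is built.
    def pad_decoder_inp_targ(self, max_len, pad_id):
        while len(self.dec_input) < max_len:
            self.dec_input.append(pad_id)
        while len(self.target) < max_len:
            self.target.append(pad_id)

    def pad_encoder_input(self, max_len, pad_id):
        while len(self.enc_input) < max_len:
            self.enc_input.append(pad_id)
        if hasattr(self, 'enc_input_extend_vocab'):  # only set in pointer-gen mode
            while len(self.enc_input_extend_vocab) < max_len:
                self.enc_input_extend_vocab.append(pad_id)
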
Example no. 16
    def __init__(self, article, abstract, vocab, hps):
        """
    """
        self.hps = hps

        start_id = vocab._word2id(data.DECODING_START)
        end_id = vocab._word2id(data.DECODING_END)

        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_len = len(article_words)
        self.enc_input = [vocab._word2id(w) for w in article_words]

        abstract_words = abstract.split()
        abs_ids = [vocab._word2id(w) for w in abstract_words]

        self.dec_input = [start_id] + abs_ids
        self.dec_target = abs_ids + [end_id]

        self.pad_id = vocab._word2id(data.PAD_TOKEN)
        if hps.pointer:
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)
            self.dec_target = abs_ids_extend_vocab + [end_id]
            while len(self.enc_input_extend_vocab) < hps.max_enc_steps:
                self.enc_input_extend_vocab.append(self.pad_id)

        while len(self.enc_input) < hps.max_enc_steps:
            self.enc_input.append(self.pad_id)

        while len(self.dec_input) < hps.max_dec_steps:
            self.dec_input.append(self.pad_id)
            self.dec_target.append(self.pad_id)

        if len(self.dec_input) > hps.max_dec_steps:
            self.dec_input = self.dec_input[:hps.max_dec_steps - 1]
            self.dec_target = self.dec_target[:hps.max_dec_steps - 1]

        self.dec_len = len(self.dec_input)

        self.original_article = article
        self.original_abstract = abstract
Example no. 17
    def __init__(self, article, abstract, vocab):
        # Start and stop decoding ids
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        article_words = article.split()
        if len(article_words) > config.max_enc_steps:
            article_words = article_words[:config.max_enc_steps]
        self.enc_len = len(article_words)
        # Encode the article; words not in the vocab get the UNK id; this is the encoder input
        self.enc_input = [vocab.word2id(w) for w in article_words]
        # Process the abstract; words not in the vocab get the UNK id
        abstract_words = abstract.split()
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        # Decoder input and target sequences
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # In pointer-generator mode, the UNK ids in enc_input are replaced by vocab_size + the OOV word's index in the in-article OOV list
        if config.pointer_gen:
            # Encoder input extended with OOV ids (UNK words get their own temporary ids), plus the OOV words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get the reference-summary ids, where OOV words are represented by their in-article OOV ids
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # New target sequence in which OOV words have their temporary ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, config.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        # The abstract before encoding
        self.original_abstract_sents = abstract
Example no. 18
  def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > config.max_enc_steps:
      article_words = article_words[:config.max_enc_steps]
    self.enc_len = len(article_words) # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
Example no. 19
    def __init__(self, article, abstract_sentences, vocab, hps, log_path):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        if log_path is not None:
            # For testing
            reference_cluster_dir = os.path.join(log_path, "reference")
            if not os.path.exists(reference_cluster_dir):
                os.makedirs(reference_cluster_dir)

            # Process the abstract
            abstract = ' '.join(abstract_sentences)  # string
            abstract = run_coreference_resolution_for_testing(
                abstract, reference_cluster_dir)
            abstract_words = abstract.split()  # list of strings
            abs_ids = [
                vocab.word2id(w) for w in abstract_words
            ]  # list of word ids; OOVs are represented by the id for UNK token
        else:
            # Process the abstract
            abstract = ' '.join(abstract_sentences)  # string
            abstract = run_coreference_resolution_for_training(abstract)
            abstract_words = abstract.split()  # list of strings
            abs_ids = [
                vocab.word2id(w) for w in abstract_words
            ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            if hps.coreference_resolution:
                indices = []
                antecedent = get_antecedent(abstract)
                if antecedent is not None:
                    for key in antecedent:
                        positions = [
                            i for i, j in enumerate(abstract_words) if j == key
                        ]
                        if positions:
                            positions = np.asarray(positions)
                            closest_index = positions[(
                                np.abs(positions - antecedent[key])).argmin()]
                            indices.append(closest_index)

                idx = 0
                if indices:
                    for i in range(len(abs_ids_extend_vocab)):
                        if i == indices[idx]:
                            if idx < len(indices) - 1:
                                idx += 1
                            else:
                                continue
                        else:
                            abs_ids_extend_vocab[i] = 0
                else:
                    # Overwrite decoder target sequence so it uses the temp article OOV ids
                    _, self.target = self.get_dec_inp_targ_seqs(
                        abs_ids_extend_vocab, hps.max_dec_steps,
                        start_decoding, stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
Example no. 20
    def __init__(self, article_sentences, extract_ids, abstract_sentences,
                 vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article_sentences: list of strings, one per article sentence. In each sentence, each token is separated by a single space.
      extract_ids: list of indices of the article sentences selected by the extractor.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Store the original strings
        self.original_article_sents = article_sentences
        self.original_extract_ids = extract_ids
        self.original_abstract_sents = abstract_sentences

        if hps.model in ['rewriter', 'end2end']:
            # Get ids of special tokens
            start_decoding = vocab.word2id(data.START_DECODING)
            stop_decoding = vocab.word2id(data.STOP_DECODING)

            if hps.model == 'rewriter':
                # Process the extracted sentences
                extract_sentences = [
                    article_sentences[idx] for idx in extract_ids
                ]
                enc_input_words = ' '.join(extract_sentences).split()
            else:
                # Process the article sentences
                enc_input_words = ' '.join(article_sentences).split()
                self.enc_input_sent_ids = []
                for idx, sent in enumerate(article_sentences):
                    sent_words = sent.split()
                    for _ in range(len(sent_words)):
                        if len(self.enc_input_sent_ids) < hps.max_enc_steps:
                            self.art_len = idx + 1
                            self.enc_input_sent_ids.append(idx)

            if len(enc_input_words) > hps.max_enc_steps:
                enc_input_words = enc_input_words[:hps.max_enc_steps]
            self.enc_len = len(
                enc_input_words
            )  # store the length after truncation but before padding
            self.enc_input = [
                vocab.word2id(w) for w in enc_input_words
            ]  # list of word ids; OOVs are represented by the id for UNK token

            # Process the abstract
            abstract = ' '.join(abstract_sentences)  # string
            abstract_words = abstract.split()  # list of strings
            abs_ids = [
                vocab.word2id(w) for w in abstract_words
            ]  # list of word ids; OOVs are represented by the id for UNK token

            # Get the decoder input sequence and target sequence
            self.dec_input, self.target = self.get_dec_inp_targ_seqs(
                abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
            self.dec_len = len(self.dec_input)

            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                enc_input_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        if hps.model in ['selector', 'end2end']:
            # Process the article
            if hps.model == 'selector':
                if len(article_sentences) > hps.max_art_len:
                    article_sentences = article_sentences[:hps.max_art_len]
                self.art_len = len(
                    article_sentences
                )  # store the length after truncation but before padding
            elif hps.model == 'end2end':
                if self.art_len > hps.max_art_len:
                    self.art_len = hps.max_art_len
                article_sentences = article_sentences[:self.art_len]

            self.art_ids = []
            self.sent_lens = []
            for sent in article_sentences:
                sent = sent.split()
                if len(sent) > hps.max_sent_len:
                    sent = sent[:hps.max_sent_len]
                self.sent_lens.append(len(sent))
                self.art_ids.append([vocab.word2id(w) for w in sent])
Example no. 21
	def __init__(self, article, abstract_sentences, vocab, hps, word_edge_list=None, query=None, query_edge_list=None, epoch_num=None, bert_vocab=None):
		"""Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

		Args:
			article: source text; a string. each token is separated by a single space.
			abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
			vocab: Vocabulary object
			hps: hyperparameters
		"""
		self.hps = hps
		# Get ids of special tokens
		start_decoding = vocab.word2id(data.START_DECODING)
		stop_decoding = vocab.word2id(data.STOP_DECODING)
		self.bert_vocab = bert_vocab
		self.epoch_num = epoch_num #deprecated
		self.enc_pos_offset = None
		self.query_pos_offset = None
		# Process the article
		article_words = article.split()
		if len(article_words) > hps.max_enc_steps.value:
			article_words = article_words[:hps.max_enc_steps.value]
		self.enc_len = len(article_words)  # store the length after truncation but before padding
		self.enc_input = [vocab.word2id(w) for w in
						  article_words]  # list of word ids; OOVs are represented by the id for UNK token
		#tf.logging.info(self.enc_len)
		if self.hps.use_elmo.value:
			self.enc_input_raw = article_words 
		# Process the abstract
		abstract = ' '.join(abstract_sentences)  # string
		abstract_words = abstract.split()  # list of strings
		abs_ids = [vocab.word2id(w) for w in
				   abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

		# Process the query 
		if hps.query_encoder.value:
			query_words = query.split()
			#query_words = word_features.get_tokens(query)
			if len(query_words) > hps.max_query_steps.value:
				#tf.logging.info('Before_query: %d Hps: %d'%(len(query_words),hps.max_query_steps.value))
				query_words = query_words[len(query_words)- hps.max_query_steps.value:]
				#tf.logging.info('Big_query : %d'%(len(query_words)))
				query = " ".join(q for q in query_words)
			self.query_len = len(query_words) # store the length after truncation but before padding
			
			self.query_input = [vocab.word2id(w) for w in query_words] # list of word ids; OOVs are represented by the id for UNK token
			if self.hps.use_query_elmo.value:
				self.query_input_raw = query_words #tensorflow_hub requires raw text
				
		# Get the decoder input sequence and target sequence
		self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps.value, start_decoding,
																 stop_decoding)
		self.dec_len = len(self.dec_input)

		# If using pointer-generator mode, we need to store some extra info
		if hps.pointer_gen.value:
			# Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
			self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

			# Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
			abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

			# Overwrite decoder target sequence so it uses the temp article OOV ids
			_, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps.value, start_decoding,
														stop_decoding)

		if hps.word_gcn.value:
			self.word_edge_list = word_edge_list

		if hps.query_gcn.value:
			self.query_edge_list = query_edge_list

		if hps.use_bert.value:
			self.enc_input, self.enc_pos_offset = bert_vocab.convert_glove_to_bert_indices(self.enc_input)	
			self.enc_len = len(self.enc_input)
			if hps.use_query_bert.value:
				self.query_input, self.query_pos_offset = bert_vocab.convert_glove_to_bert_indices(self.query_input)	
				self.query_len = len(self.query_input)
		
		# Store the original strings
		self.original_article = article
		self.original_abstract = abstract
		self.original_abstract_sents = abstract_sentences
		#if hps.query_encoder:
		self.original_query = query
Example no. 22
    def __init__(self, article, abstract, vocab, hps):
        """
        Initializes the Example, performing tokenization and truncation to produce the encoder,
        decoder and target sequences, which are stored in self.
    
        Args:
            article: source text; a string. each token is separated by a single space.
            abstract: reference summary; a string. each token is separated by a single space.
            vocab: Vocabulary object
            hps: hyperparameters
        """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING, None)
        stop_decoding = vocab.word2id(data.STOP_DECODING, None)

        # Process the article
        article_words = [data.parse_word(word) for word in article.split()]
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]

        # Store the length after truncation but before padding
        self.enc_len = len(article_words)
        # List of word ids; OOVs and entities are represented by ids less than data.N_FREE_TOKENS
        self.enc_input = [
            vocab.word2id(w, word_type) for w, word_type in article_words
        ]

        # Process the abstract
        abstract_words = [data.parse_word(word) for word in abstract.split()]
        # List of word ids; OOVs and entities are represented by ids less than data.N_FREE_TOKENS
        abs_ids = [
            vocab.word2id(w, word_type) for w, word_type in abstract_words
        ]

        # Get the decoder input sequence and target sequence with non-article specific ids.
        self.dec_input, target_orig = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)
        # Store a version of the enc_input where in-article OOVs and entities are represented by
        # their temporary OOV id. Also store the in-article OOVs words themselves and a mapping
        # from temporary OOV ids to vocab ids.
        self.enc_input_extend_vocab, self.article_oovs, self.article_id_to_word_id = (
            data.article2ids(article_words, vocab, hps.copy_only_entities))

        # Get set of words that can be copied.
        if hps.copy_only_entities:
            # article_oovs only has entities
            copyable_words = set(self.article_oovs)
        else:
            copyable_words = set([w for w, word_type in article_words])

        # Get a version of the reference summary where in-article OOVs are represented by their
        # temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                 self.article_oovs,
                                                 copyable_words,
                                                 hps.output_vocab_size)
        # Set decoder target sequence that uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab,
                                                    hps.max_dec_steps,
                                                    start_decoding,
                                                    stop_decoding)

        # Compute a mask for which tokens are people.
        people_tokens = {
            vocab.word2id('', token)
            for token in data.PERSON_TOKENS
        }
        self.target_people = [
            float(token in people_tokens) for token in target_orig
        ]

        # Get list of people ids
        self.people_ids = []
        for article_id, word_id in self.article_id_to_word_id.items():
            if word_id in people_tokens:
                self.people_ids.append(article_id)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
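The people-mask step in the example above depends on reserved ids for data.PERSON_TOKENS. A toy, runnable illustration of that step with made-up ids (both the id values and the target sequence are assumptions used only for illustration):

# Hypothetical reserved ids standing in for {vocab.word2id('', t) for t in data.PERSON_TOKENS}.
people_tokens = {11, 12}
# Hypothetical decoder target ids (target_orig in the example above).
target_orig = [11, 347, 12, 5021]
target_people = [float(token in people_tokens) for token in target_orig]
assert target_people == [1.0, 0.0, 1.0, 0.0]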
Esempio n. 23
0
    def __init__(self, article, abstract_sentences, article_id, sections, section_names, labels, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        Args:
          article: source text; a list of strings. each token is separated by a single space.
          abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
          article_id: string
          sections: list of list of strings
          section_names: list of strings
          labels: list of strings, for extractive summarization training (TODO Later)
          vocab: Vocabulary object
          hps: hyperparameters
        """
        self.hps = hps
        self.discard = False
        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)


        # clean section information
        # clean sections after conclusions
        if hps.hier:
          end_loc = len(section_names)
          beg_loc = 0
          for i,s in enumerate(section_names):
            if 'conclu' in s.lower():
              end_loc = i + 1
            if 'intro' in s.lower() and beg_loc == 0:
              beg_loc = i
              
          if beg_loc < len(section_names) - end_loc:
            sections = sections[beg_loc:end_loc]
          try:
            intro_last = sections[beg_loc][-2:] # last two sentences in the intro
          except IndexError:
#             print('article_id: {}, len(sections): {}, section_names: {}'.format(article_id, len(sections), section_names))
            self.discard = True
            return
#           intro_first = []
          i = 0
#           intro_last_len = _count_words(intro_last)
#           intro_len = intro_last_len
#           while(intro_len < hps.max_intro_len):
#             intro_first.append(sections[beg_loc][i])
#             intro_len = _count_words(intro_first) + intro_last_len
#             i += 1
          
          if not hps.split_intro:
          
            max_sents = hps.max_intro_sents - 2 # exclude the last two sents
            intro_first = sections[beg_loc][:max_sents]
            intro_last_words = _get_section_words(intro_last, pad=False)
            intro_last_len = len(intro_last_words) # flatten list of sents, get the string inside, count words
            
            discard_last = False
            if intro_last_len > hps.max_intro_len:
              discard_last = True
            len_limit = hps.max_intro_len - intro_last_len if not discard_last else hps.max_intro_len
            # Truncate the intro to len_limit (the last two sentences of the intro are always kept)
            # Flatten the list of lists, take the first element (a string), split into words, keep the first n words, rejoin into a string, make it a list, and extend it with intro_last
            intro_words = _get_section_words(intro_first, len_limit, pad=False)
            
            try:
              if intro_words[-1] != '.':
                intro_words = intro_words[:-1] + ['.']
                if not discard_last:
                  intro_words += intro_last_words
                intro_words = _pad_words(intro_words, hps.max_intro_len)
            except IndexError:
              print('No first section, Example discarded: ', article_id)
              self.discard = True
          
          else:    
            intro_first = sections[beg_loc][:hps.max_intro_sents]
            intro_words = _get_section_words(intro_first, hps.max_intro_len, pad=True)

          try:
            conclusion_words = _get_section_words(sections[end_loc - beg_loc - 1][:hps.max_conclusion_sents], hps.max_conclusion_len)
          except:
            import pdb; pdb.set_trace()
            print("ERROR, pause and check")
            print('end_loc:', end_loc)
            print('section_names:', section_names)
            print('num_sections: {}'.format(len(sections)))
            print('len_sections_sents:', [len(e) for e in sections])
            
#           if not hps.intro_split:
          article_sections = [_get_section_words(s[:hps.max_section_sents], hps.max_section_len)
                              for s in sections[1:-1][:hps.num_sections - 2]]
#           else:
#             tmp_sections = []
#             remaining_sec = sections[1:-1]
#             if len(remaining_sec) > hps.num_sections - 2:
#               for i in range(hps.num_sections - 2):
#                 tmp_sections.append(remaining_sec[i])
#               last_sec = []
#               while(i < len(remaining_sec)):
#                 last_sec.extend(remaining_sec[i])
#                 i += 1
#               tmp_sections.append(last_sec)
#               remaining_sec = tmp_sections
#   
#             article_sections = [_get_section_words(s, hps.max_section_len)
#                                 for s in remaining_sec]
          
          sections = [intro_words] + article_sections + [conclusion_words]
          sec_len = len(sections)
          self.sec_len = sec_len
          self.num_words_section = [hps.max_section_len for e in sections] 
          self.num_words_section_nopad = [len(e) for e in sections]
          # TODO: Assumption is that sections is a list of lists (sections, sentences); check if this assumption is true
          # TODO: Assumption is that the number of sections is greater than 2; check if this assumption is true
          
#           pad_id = vocab.word2id(data.PAD_TOKEN)
          
          
            
        

        article_text = ' '.join(article)
        # Process the article
        article_words = article_text.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        # store the length after truncation but before padding
        self.enc_len = len(article_words)
        # list of word ids; OOVs are represented by the id for UNK token
        self.enc_input = [vocab.word2id(w) for w in article_words]
        
        if hps.hier:
          self.enc_sections = []
          
          for sec in sections:
            self.enc_sections.append([vocab.word2id(w) for w in sec])
          self.enc_sec_len = [len(e) for e in self.enc_sections]
#           self.enc_sec_len = sec_len # TODO: Check

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        # list of word ids; OOVs are represented by the id for UNK token
        abs_ids = [vocab.word2id(w) for w in abstract_words]

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are
            # represented by their temporary OOV id; also store the in-article
            # OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are
            # represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(
                abstract_words, vocab, self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV
            # ids, the target now includes words that are in the article but
            # not in the abstract, so represented as OOV
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

        self.article_id = article_id
        self.sections = sections
        self.section_names = section_names
        self.labels = labels

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
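The hierarchical branch above leans on _count_words, _pad_words and _get_section_words helpers that are not part of this snippet. A rough sketch of plausible implementations, inferred only from the call sites here (the signatures and the '[PAD]' token are assumptions; the actual repository may differ):

def _count_words(sents):
    # Total number of whitespace-separated tokens across a list of sentence strings.
    return sum(len(sent.split()) for sent in sents)

def _pad_words(words, max_len, pad_token='[PAD]'):
    # Pad a list of word strings up to max_len (no-op if already long enough).
    return words + [pad_token] * (max_len - len(words))

def _get_section_words(sents, max_len=None, pad=True, pad_token='[PAD]'):
    # Flatten sentence strings into one word list, truncate to max_len, optionally pad.
    words = ' '.join(sents).split()
    if max_len is not None:
        words = words[:max_len]
        if pad:
            words = _pad_words(words, max_len, pad_token)
    return words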
Esempio n. 24
0
    def __init__(self, article, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        #Leena change article and abstract_sentences here if you want to do a quick decoding test

        article = article.lower()

        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        #Leena: Trying to understand the data pipeline
        #    print("article_words: %s"%article_words) #word tokens; this would be used further in the code

        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]

        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string

        #Leena: used this to save summaries in a file to train the LM
        #    if hps.mode == 'train':
        #        with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/train_data.txt", "a") as myfile:
        #            myfile.write("\n%s\n"%abstract)

        #    if hps.mode == 'eval':
        #        with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/valid_data.txt", "a") as myfile:
        #            myfile.write("\n%s\n"%abstract)
        #
        #    if hps.mode == 'decode':
        #        with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/test_data.txt", "a") as myfile:
        #            myfile.write("\n%s\n"%abstract)

        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
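For context, the two-argument data.article2ids calls used throughout these examples map in-article OOV words to temporary ids just past the end of the vocabulary. A minimal sketch assuming the conventional pointer-generator implementation (the UNKNOWN_TOKEN constant below is a stand-in for whatever the data module actually defines):

UNKNOWN_TOKEN = '[UNK]'  # placeholder; the data module defines its own constant

def article2ids(article_words, vocab):
    ids, oovs = [], []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is out of vocabulary
            if w not in oovs:
                oovs.append(w)
            # Temporary id: vocab.size() for the first article OOV, vocab.size()+1 for the next, ...
            ids.append(vocab.size() + oovs.index(w))
        else:
            ids.append(i)
    return ids, oovs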
Esempio n. 25
0
    def __init__(self, article, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.
    # An Example is initialized from one article-abstract pair.
    Args:
      article: source text; a string. each token is separated by a single space. (the source text, given as a string)
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. (a list of abstract sentences, each given as a string)
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        # Get the ids of the start token <s> and the stop token </s>
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        # The article as a list of word strings
        article_words = article.split()
        # Truncate if the article exceeds max_enc_steps
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        # Length before padding
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        # Article word ids; OOVs are mapped to UNK
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        # Abstract word ids; OOVs are mapped to UNK
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        # The decoder input is <s> + seq; the target is seq + </s>
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        # Length of the decoder input (before padding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            # With pointer-gen enabled, get the extended-vocab ids (an id list) and the in-article OOVs (a string list)
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            # i.e. the abstract ids in their extended-vocab version
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            # Overwrite the target
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
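The companion helper data.abstract2ids maps reference-summary OOVs to the same temporary ids when the word also occurs in the article, and to UNK otherwise, which is what lets the overwritten target point at copied words. Again a minimal sketch of the conventional form, not necessarily this fork's exact code:

UNKNOWN_TOKEN = '[UNK]'  # placeholder; the data module defines its own constant

def abstract2ids(abstract_words, vocab, article_oovs):
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in abstract_words:
        i = vocab.word2id(w)
        if i == unk_id:
            if w in article_oovs:
                # Reuse the article's temporary OOV id so the pointer can copy it.
                ids.append(vocab.size() + article_oovs.index(w))
            else:
                # An OOV that never appears in the article stays UNK.
                ids.append(unk_id)
        else:
            ids.append(i)
    return ids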
Esempio n. 26
0
    def __init__(self, reviews, ratings, answer_sentences, question, label,
                 vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      reviews: review text; a list.
      ratings: a list.
      answer_sentences: list of strings, one per answer sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the reviews
        self.r_lens = []
        self.r_batch = []
        self.rating_batch = ratings
        reviews_words = []
        for review in reviews:
            review_words = review.split()
            if len(review_words) > hps.max_enc_steps:
                review_words = review_words[:hps.max_enc_steps]
            reviews_words.append(review_words)
            self.r_lens.append(
                len(review_words
                    ))  # store the length after truncation but before padding
            self.r_batch.append(
                [vocab.word2id(w) for w in review_words]
            )  # list of word ids; OOVs are represented by the id for UNK token

        # Process the answer
        answer = ' '.join(answer_sentences)  # string
        answer_words = answer.split()  # list of strings
        ans_ids = [
            vocab.word2id(w) for w in answer_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the question
        question_words = question.split()
        self.q_lens = len(question_words)
        self.q_batch = [vocab.word2id(w) for w in question_words]

        # Process the label
        self.y_target = label

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            ans_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)
        assert self.dec_len > 0

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            self.oovs = []
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.reviews_extend_vocab = []
            for review_words in reviews_words:
                review_extend_vocab, self.oovs = data.article2ids(
                    review_words, vocab, self.oovs)
                self.reviews_extend_vocab.append(review_extend_vocab)

            # question OOV id.
            self.question_extend_vocab, self.oovs = data.article2ids(
                question_words, vocab, self.oovs)

            # Get a version of the reference answer where OOVs are represented by their temporary OOV id
            ans_ids_extend_vocab = data.abstract2ids(answer_words, vocab,
                                                     self.oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                ans_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_reviews = reviews
        self.original_answer = answer
        self.original_answer_sents = answer_sentences
        self.original_question = question
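Esempio n. 26 feeds a running self.oovs list back into data.article2ids so that every review and the question share a single temporary OOV numbering. A sketch of such a variant, with the optional third argument inferred from the call sites above rather than taken from the repository:

UNKNOWN_TOKEN = '[UNK]'  # placeholder; the data module defines its own constant

def article2ids(article_words, vocab, oovs=None):
    # `oovs` accumulates OOVs across calls, so reviews and the question share temp ids.
    oovs = [] if oovs is None else list(oovs)
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab.size() + oovs.index(w))
        else:
            ids.append(i)
    return ids, oovs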
Esempio n. 27
0
    def __init__(self, article, abstract_sentences, all_abstract_sentences,
                 doc_indices, raw_article_sents, ssi, article_lcs_paths_list,
                 vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        Args:
            article: source text; a string. each token is separated by a single space.
            abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
            vocab: Vocabulary object
            hps: hyperparameters
        """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # # Process the article
        # article_words = article.split()
        # if len(article_words) > hps.max_enc_steps:
        #     article_words = article_words[:hps.max_enc_steps]
        # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:

            if raw_article_sents is not None and len(raw_article_sents) > 0:
                # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents]
                self.tokenized_sents = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
                if self.hps.sep:
                    for sent in self.tokenized_sents[:-1]:
                        sent.append(data.SEP_TOKEN)

                # Process the article
                article_words = util.flatten_list_of_lists(
                    self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [
                    vocab.word2id(w) for w in article_words
                ]  # list of word ids; OOVs are represented by the id for UNK token

                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps]
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding
            else:
                # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
                article_str = util.to_unicode(article)
                raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
                self.tokenized_sents = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]

                # Process the article
                article_words = util.flatten_list_of_lists(
                    self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [
                    vocab.word2id(w) for w in article_words
                ]  # list of word ids; OOVs are represented by the id for UNK token

                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps]
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding

            if self.hps.word_imp_reg:
                self.enc_importances = self.get_enc_importances(
                    self.tokenized_sents, abstract_words)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        if ssi is not None:
            # Translate the similar source indices into masks over the encoder input
            self.ssi_masks = []
            for source_indices in ssi:
                ssi_sent_mask = [0.] * len(raw_article_sents)
                for source_idx in source_indices:
                    if source_idx >= len(ssi_sent_mask):
                        continue  # guard against indices that fall outside the sentence mask
                    ssi_sent_mask[source_idx] = 1.
                ssi_mask = pg_mmr_functions.convert_to_word_level(
                    ssi_sent_mask, self.tokenized_sents)
                self.ssi_masks.append(ssi_mask)

            summary_sent_tokens = [
                sent.strip().split() for sent in abstract_sentences
            ]
            if self.hps.ssi_data_path is None and len(
                    self.ssi_masks) != len(summary_sent_tokens):
                raise Exception(
                    'len(self.ssi_masks) != len(summary_sent_tokens)')

            self.sent_indices = pg_mmr_functions.convert_to_word_level(
                list(range(len(summary_sent_tokens))),
                summary_sent_tokens).tolist()

        if article_lcs_paths_list is not None:
            if len(article_lcs_paths_list) > 1:
                raise Exception('Need to implement for non-sent_dataset')
            article_lcs_paths = article_lcs_paths_list[0]
            imp_mask = [0] * len(article_words)
            to_add = 0
            for source_idx, word_indices_list in enumerate(article_lcs_paths):
                if source_idx > 0:
                    to_add += len(self.tokenized_sents[source_idx - 1])
                for word_idx in word_indices_list:
                    if word_idx + to_add >= len(imp_mask):
                        if len(imp_mask) == hps.max_enc_steps:
                            continue
                        else:
                            print(self.tokenized_sents, article_lcs_paths)
                            raise Exception(
                                'word_idx + to_add (%d) is larger than imp_mask size (%d)'
                                % (word_idx + to_add, len(imp_mask)))
                    imp_mask[word_idx + to_add] = 1
            self.importance_mask = imp_mask

        # Store the original strings
        self.original_article = article
        self.raw_article_sents = raw_article_sents
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
        self.all_original_abstract_sents = all_abstract_sentences

        self.doc_indices = doc_indices
        self.ssi = ssi
        self.article_lcs_paths_list = article_lcs_paths_list
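Two utilities appear above without their definitions: util.flatten_list_of_lists is presumably the usual one-liner, and pg_mmr_functions.convert_to_word_level plausibly repeats each sentence-level value once per token so that a sentence mask becomes a word mask. Both are sketched here from their call sites only, as assumptions rather than the repositories' actual code:

import numpy as np

def flatten_list_of_lists(list_of_lists):
    # [[a, b], [c]] -> [a, b, c]
    return [item for sublist in list_of_lists for item in sublist]

def convert_to_word_level(sent_values, tokenized_sents):
    # Repeat each per-sentence value len(sentence) times, yielding a word-level array.
    word_values = []
    for value, sent in zip(sent_values, tokenized_sents):
        word_values.extend([value] * len(sent))
    return np.array(word_values)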
Esempio n. 28
0
  def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    # process sent by viet nguyen
    sent_text = nltk.sent_tokenize(article)
    list_number_sent=[]
    article_words = []

    for sentence in sent_text:
        split_sent = sentence.split()
        try:
          number_sent = int((split_sent[0][1:-1]))
        except ValueError:
          number_sent = 100
        s = split_sent[1:]
        
        for word in s:
            list_number_sent.append(number_sent)
            article_words.append(word)
    # article_words = article.split()
    # calculate term frequency (tf) for each distinct word in the article
    word_dict = set(article_words)
    wordDictA = dict.fromkeys(word_dict, 0)

    for sentence in sent_text:
        split_sent = sentence.split() 
        try:
          number_sent = int((split_sent[0][1:-1]))
        except ValueError:
          number_sent = 100
        s = split_sent[1:]
        for word in s:
            wordDictA[word]+=1

    tf_dict = {}
    tf_list=[]
    for word, count in wordDictA.items():
        tf_dict[word] = count/float(len(article_words))
        tf_list.append(tf_dict[word])
    if len(tf_list) > hps.max_enc_steps:
      tf_list = tf_list[:hps.max_enc_steps]
    if len(list_number_sent) > hps.max_enc_steps:
      list_number_sent = list_number_sent[:hps.max_enc_steps]
    if len(article_words) > hps.max_enc_steps:
      article_words = article_words[:hps.max_enc_steps]
    self.enc_tf_list = tf_list
    self.enc_number_sent = list_number_sent
    self.enc_len = len(article_words) # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

      # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
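The parsing above assumes every sentence in `article` starts with its sentence index in square brackets; the split_sent[0][1:-1] slice strips the brackets, and a ValueError falls back to 100. A toy illustration of the assumed format (the sample sentence is hypothetical):

# Hypothetical sentence in the assumed "[<sent_idx>] token token ..." format.
sentence = "[3] the cat sat on the mat ."
split_sent = sentence.split()
number_sent = int(split_sent[0][1:-1])  # "[3]" -> 3
tokens = split_sent[1:]
assert number_sent == 3 and tokens[0] == "the"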
Esempio n. 29
0
    def __init__(self, article, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        article_words = article.split()
        # print ("Example __init__ article_words: ", len(article_words))# list of str
        # print ("__init__ hps.max_enc_steps: ", hps.max_enc_steps) # train flag
        if len(article_words) > hps.max_enc_steps.value:
            article_words = article_words[:hps.max_enc_steps.value]
        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token
        # print ("Example __init__ self.enc_len: ", self.enc_len) # int
        # print ("Example __init__ self.enc_input: ", len(self.enc_input)) # list of int

        # Process the abstract
        # print ("abstract_sentences: ", abstract_sentences)
        if hps.use_doc_vec.value:
            abstract_sentences_list = abstract_sentences[0].split()
            subred_tag = abstract_sentences_list[0]
            # print ("subred_tag: ", subred_tag)
            abstract_sentences = [' '.join(abstract_sentences_list[1:])]
            # print ("abstract_sentences: ", abstract_sentences)

        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps.value, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        # print("__init__ hps.pointer_gen: ", hps.pointer_gen) # train flag
        if hps.pointer_gen.value:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            # print ("__init__ hps.max_dec_steps: ", hps.max_dec_steps) # train flag
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps.value, start_decoding,
                stop_decoding)

        # Store the original strings
        if hps.use_doc_vec.value:
            self.subred_tag = SUBRED_TABLE[subred_tag]
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
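With use_doc_vec enabled, the example above treats the first token of the reference summary as a subreddit tag and looks it up in SUBRED_TABLE. A toy illustration of that split; the tag and the table contents here are made up purely for illustration:

# Hypothetical reference whose first token is the subreddit tag.
abstract_sentences = ['askreddit what is the best advice you have ever received ?']
SUBRED_TABLE = {'askreddit': 0}  # made-up mapping for illustration

abstract_sentences_list = abstract_sentences[0].split()
subred_tag = abstract_sentences_list[0]
abstract_sentences = [' '.join(abstract_sentences_list[1:])]
assert SUBRED_TABLE[subred_tag] == 0
assert abstract_sentences == ['what is the best advice you have ever received ?']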
Esempio n. 30
0
    def __init__(self, article, abstract_sentences, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

    Args:
      article: source text; a string. each token is separated by a single space.
      abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
      vocab: Vocabulary object
      hps: hyperparameters
    """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article
        # Need to shuffle only the sentences within the hps.max_enc_steps.
        original_article_clean = article
        sentences = article.split('\n')
        if (hps.keep_stopwords < 1.0) or (hps.keep_word < 1.0):
            for i, sent in enumerate(sentences):
                sent_processed = []
                for word in sent.split(' '):
                    # Remove stopwords with specified probability
                    if hps.keep_stopwords < 1.0:
                        if (word.lower() in stopwords) and (
                                random.random() > hps.keep_stopwords):
                            continue
                    # Remove any word with specified probability.
                    if hps.keep_word < 1.0:
                        if (random.random() > hps.keep_word):
                            continue
                    sent_processed.append(word)
                sentences[i] = ' '.join(sent_processed)

        article = '\n'.join(sentences)

        if hps.shuffle_sentences:
            sentences = article.split('\n')
            token_counter = 0
            for idx, sent in enumerate(sentences):
                token_counter += len(sent.split())
                if token_counter >= hps.max_enc_steps:
                    sentences[idx] = ' '.join(sent.split()[:hps.max_enc_steps -
                                                           token_counter])
                    break
            sentences = sentences[:idx + 1]

            sentences = [
                sent for sent in sentences if (sent != '\n' and sent != '')
            ]
            random.shuffle(sentences)
            article_words = ' '.join(sentences).split()

        else:
            article_words = article.split()
            if len(article_words) > hps.max_enc_steps:
                article_words = article_words[:hps.max_enc_steps]

        self.enc_len = len(
            article_words
        )  # store the length after truncation but before padding
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:
            # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
            self.enc_input_extend_vocab, self.article_oovs = data.article2ids(
                article_words, vocab)

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article_clean = original_article_clean
        self.original_article = article
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
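The dropout step above keeps each word with probability hps.keep_word, and each stopword additionally with probability hps.keep_stopwords. A compact, self-contained toy version of the same filtering (the stopword set and probabilities are assumed values, not the repository's):

import random

stopwords = {'the', 'a', 'an', 'of'}  # assumed stopword set; the repo loads its own
keep_stopwords, keep_word = 0.5, 0.9  # stand-ins for hps.keep_stopwords / hps.keep_word

def drop_words(sentence):
    kept = []
    for word in sentence.split(' '):
        if word.lower() in stopwords and random.random() > keep_stopwords:
            continue  # drop this stopword
        if random.random() > keep_word:
            continue  # drop any word with probability 1 - keep_word
        kept.append(word)
    return ' '.join(kept)

print(drop_words('the cat sat on the mat'))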
Esempio n. 31
0
 def get_sent_tokens(sent):
     words = sent.split()
     return data.abstract2ids(words, vocab, art_oovs)
Esempio n. 32
0
    def __init__(self, article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        Args:
            article: source text; a string. each token is separated by a single space.
            abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
            vocab: Vocabulary object
            hps: hyperparameters
        """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)


        # # Process the article
        # article_words = article.split()
        # if len(article_words) > hps.max_enc_steps:
        #     article_words = article_words[:hps.max_enc_steps]
        # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences) # string
        abstract_words = abstract.split() # list of strings
        abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:

            if raw_article_sents is not None and len(raw_article_sents) > 0:
                # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents]
                self.tokenized_sents = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]

                # Process the article
                article_words = util.flatten_list_of_lists(self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [vocab.word2id(w) for w in
                                  article_words]  # list of word ids; OOVs are represented by the id for UNK token

                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents)
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps]
                self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding
            else:
                # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
                article_str = util.to_unicode(article)
                raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
                self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents]

                # Process the article
                article_words = util.flatten_list_of_lists(self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [vocab.word2id(w) for w in
                                  article_words]  # list of word ids; OOVs are represented by the id for UNK token

                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents)
                # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps]
                self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

        # Store the original strings
        self.original_article = article
        self.raw_article_sents = raw_article_sents
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
        self.all_original_abstract_sents = all_abstract_sentences

        self.doc_indices = doc_indices
        self.ssi = ssi