def get_article_inputs(self, article):
  """Convert a single article into a padded, batch-tiled encoder input."""
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  enc_inputs = []
  # Convert first N sentences to word IDs, stripping existing <s> and </s>.
  for i in xrange(min(self._max_article_sentences, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)

  # Truncate over-long input so the batch assignment below cannot overflow.
  if len(enc_inputs) > self._hps.enc_timesteps:
    enc_inputs = enc_inputs[:self._hps.enc_timesteps]

  # Now len(enc_inputs) should be <= enc_timesteps.
  enc_input_len = len(enc_inputs)

  # Pad if necessary.
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  # Tile the single example across every row of the batch.
  enc_batch = np.zeros(
      (self._hps.batch_size, self._hps.enc_timesteps), dtype=np.int32)
  enc_input_lens = np.zeros((self._hps.batch_size), dtype=np.int32)
  for i in xrange(self._hps.batch_size):
    enc_input_lens[i] = enc_input_len
    enc_batch[i, :] = enc_inputs[:]

  return (enc_batch, enc_input_lens)
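# A minimal, self-contained sketch of the truncate/pad/tile pattern used in
# get_article_inputs, assuming only numpy. The pad_and_tile name and all the
# example values are illustrative stand-ins for the self._vocab / self._hps
# fields above, not part of the original code.
import numpy as np

def pad_and_tile(ids, enc_timesteps, batch_size, pad_id):
  # Record the true (unpadded) length first, capped at enc_timesteps.
  true_len = min(len(ids), enc_timesteps)
  # Truncate if too long, then pad with pad_id up to enc_timesteps.
  ids = ids[:enc_timesteps] + [pad_id] * (enc_timesteps - true_len)
  # Tile the same example across every row of the batch.
  batch = np.tile(np.asarray(ids, dtype=np.int32), (batch_size, 1))
  lens = np.full((batch_size,), true_len, dtype=np.int32)
  return batch, lens

# Example: a 3-token input padded to 6 timesteps and tiled across 4 rows.
# pad_and_tile([7, 8, 9], enc_timesteps=6, batch_size=4, pad_id=0)
# -> batch.shape == (4, 6), lens == [3, 3, 3, 3]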
def _convertInputToModelTensor(self, article):
  """Convert a raw article string into (enc_inputs, enc_input_len, text)."""
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  enc_inputs = []
  # Convert first N sentences to word IDs, stripping existing <s> and </s>.
  for i in xrange(min(FLAGS.max_article_sentences, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)

  # Truncate over-long input to enc_timesteps.
  if len(enc_inputs) > self._hps.enc_timesteps:
    tf.logging.warning('Truncating the example - too long.\nenc:%d',
                       len(enc_inputs))
    enc_inputs = enc_inputs[:self._hps.enc_timesteps]

  # Now len(enc_inputs) should be <= enc_timesteps.
  enc_input_len = len(enc_inputs)

  # Pad if necessary.
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  return (enc_inputs, enc_input_len, ' '.join(article_sentences))
def _Decode(self, article_text):
  """Run beam-search decoding over a single article string.

  Args:
    article_text: Raw input text to decode.

  Returns:
    A (QUESTION, ANSWER) tuple: the input text and the decoded output.
  """
  bs = beam_search.BeamSearch(
      self._model, self._hps.batch_size,
      self._vocab.WordToId(data.SENTENCE_START),
      self._vocab.WordToId(data.SENTENCE_END),
      self._hps.dec_timesteps)

  # Wrap the raw text in the corpus markup, then split it back into sentences.
  article = '<d><p><s>' + article_text + '</s></p></d>'
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  enc_inputs = []
  for i in xrange(min(100, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
  enc_input_len = len(enc_inputs)
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  # Tile the single example across the batch_size rows the model expects.
  article_batch_cp = [enc_inputs for _ in xrange(self._hps.batch_size)]
  article_lens_cp = [enc_input_len for _ in xrange(self._hps.batch_size)]

  best_beam = bs.BeamSearch(self._sess, article_batch_cp, article_lens_cp)[0]
  decode_output = [int(t) for t in best_beam.tokens[1:]]

  QUESTION = article_text
  answer = ' '.join(data.Ids2Words(decode_output, self._vocab))
  # Keep only the text before the first </s>, if one was generated.
  end_p = answer.find(data.SENTENCE_END, 0)
  if end_p != -1:
    answer = answer[:end_p]
  ANSWER = answer.replace('<UNK>', '')
  return QUESTION, ANSWER
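# A hedged sketch of the decode post-processing in _Decode: join decoded
# words into text, cut at the first sentence-end marker, and drop <UNK>
# placeholders. postprocess is an illustrative helper, not part of the model
# code; the token constants mirror data.SENTENCE_END / the <UNK> placeholder.
SENTENCE_END = '</s>'
UNKNOWN_TOKEN = '<UNK>'

def postprocess(decoded_words):
  text = ' '.join(decoded_words)
  # Keep only the text before the first </s>, if one was generated.
  end_p = text.find(SENTENCE_END)
  if end_p != -1:
    text = text[:end_p]
  # Remove unknown-word placeholders, as _Decode does for ANSWER.
  return text.replace(UNKNOWN_TOKEN, '').strip()

# Example:
# postprocess(['a', 'class', 'is', 'a', 'template', '</s>', 'junk'])
# -> 'a class is a template'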
def _FillInputQueue(self):
  """Fill input queue with ModelInput."""
  start_id = self._vocab.WordToId(data.SENTENCE_START)
  end_id = self._vocab.WordToId(data.SENTENCE_END)
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
  while True:
    (article, abstract) = input_gen.next()
    article_sentences = [sent.strip() for sent in
                         data.ToSentences(article, include_token=False)]
    abstract_sentences = [sent.strip() for sent in
                          data.ToSentences(abstract, include_token=False)]

    enc_inputs = []
    # Use the <s> as the <GO> symbol for decoder inputs.
    dec_inputs = [start_id]

    # Convert first N sentences to word IDs, stripping existing <s> and </s>.
    for i in xrange(min(self._max_article_sentences,
                        len(article_sentences))):
      enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
    for i in xrange(min(self._max_abstract_sentences,
                        len(abstract_sentences))):
      dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

    # Filter out too-short input.
    if (len(enc_inputs) < self._hps.min_input_len or
        len(dec_inputs) < self._hps.min_input_len):
      tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                         len(enc_inputs), len(dec_inputs))
      continue
    # If we're not truncating input, throw out too-long input.
    if not self._truncate_input:
      if (len(enc_inputs) > self._hps.enc_timesteps or
          len(dec_inputs) > self._hps.dec_timesteps):
        tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                           len(enc_inputs), len(dec_inputs))
        continue
    # If we are truncating input, do so if necessary.
    else:
      if len(enc_inputs) > self._hps.enc_timesteps:
        enc_inputs = enc_inputs[:self._hps.enc_timesteps]
      if len(dec_inputs) > self._hps.dec_timesteps:
        dec_inputs = dec_inputs[:self._hps.dec_timesteps]

    # targets is dec_inputs without <s> at beginning, plus </s> at end.
    targets = dec_inputs[1:]
    targets.append(end_id)

    # Now len(enc_inputs) should be <= enc_timesteps, and
    # len(targets) = len(dec_inputs) should be <= dec_timesteps.
    enc_input_len = len(enc_inputs)
    dec_output_len = len(targets)

    # Pad if necessary.
    while len(enc_inputs) < self._hps.enc_timesteps:
      enc_inputs.append(pad_id)
    while len(dec_inputs) < self._hps.dec_timesteps:
      dec_inputs.append(end_id)
    while len(targets) < self._hps.dec_timesteps:
      targets.append(end_id)

    element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                         dec_output_len, ' '.join(article_sentences),
                         ' '.join(abstract_sentences))
    self._input_queue.put(element)
def _FillInputQueue(self):
  """Fill input queue with ModelInput.

  Token constants used below:
    SENTENCE_START = '<s>'
    SENTENCE_END = '</s>'
    UNKNOWN_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'
  """
  start_id = self._vocab.WordToId(data.SENTENCE_START)
  end_id = self._vocab.WordToId(data.SENTENCE_END)
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
  while True:
    (article, abstract) = six.next(input_gen)
    # Yields individual sentences, each starting with <s> and ending with
    # </s>; with include_token=False those markers are stripped off.
    article_sentences = [sent.strip() for sent in
                         data.ToSentences(article, include_token=False)]
    abstract_sentences = [sent.strip() for sent in
                          data.ToSentences(abstract, include_token=False)]

    enc_inputs = []
    # Use the <s> as the <GO> symbol for decoder inputs, i.e. prepend <s>
    # to the decoder input sequence.
    dec_inputs = [start_id]

    # Convert first N sentences to word IDs, stripping existing <s> and </s>.
    for i in xrange(min(self._max_article_sentences,
                        len(article_sentences))):
      # Turn one sentence into a vector of word ids.
      enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
    for i in xrange(min(self._max_abstract_sentences,
                        len(abstract_sentences))):
      dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

    # Filter out too-short input.
    if (len(enc_inputs) < self._hps.min_input_len or
        len(dec_inputs) < self._hps.min_input_len):
      tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                         len(enc_inputs), len(dec_inputs))
      continue
    # Input too long: drop it unless truncation is enabled.
    if not self._truncate_input:
      if (len(enc_inputs) > self._hps.enc_timesteps or
          len(dec_inputs) > self._hps.dec_timesteps):
        tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                           len(enc_inputs), len(dec_inputs))
        continue
    # If we are truncating input, do so if necessary.
    else:
      if len(enc_inputs) > self._hps.enc_timesteps:
        enc_inputs = enc_inputs[:self._hps.enc_timesteps]
      if len(dec_inputs) > self._hps.dec_timesteps:
        dec_inputs = dec_inputs[:self._hps.dec_timesteps]

    # targets is dec_inputs without <s> at the beginning, plus </s> at the
    # end: the decoder input starts with <s>, the target ends with </s>.
    targets = dec_inputs[1:]
    targets.append(end_id)

    # Now len(enc_inputs) should be <= enc_timesteps, and
    # len(targets) = len(dec_inputs) should be <= dec_timesteps.
    enc_input_len = len(enc_inputs)
    dec_output_len = len(targets)

    # Pad if shorter than the fixed lengths: dec_inputs is [<s>, ...],
    # targets is [..., </s>].
    while len(enc_inputs) < self._hps.enc_timesteps:
      enc_inputs.append(pad_id)  # <PAD>; enc_inputs contains no <s>/</s>.
    while len(dec_inputs) < self._hps.dec_timesteps:
      dec_inputs.append(end_id)
    while len(targets) < self._hps.dec_timesteps:
      targets.append(end_id)

    # Put the namedtuple on the queue: enc_inputs is the encoder input,
    # dec_inputs is the decoder input, targets is the decoder's output target.
    element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                         dec_output_len, ' '.join(article_sentences),
                         ' '.join(abstract_sentences))
    self._input_queue.put(element)
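# A minimal sketch of how decoder inputs and targets relate in both
# _FillInputQueue variants above, assuming toy integer ids for <s>, </s>,
# and <PAD>. make_decoder_pair and the example values are illustrative
# stand-ins for the vocab lookups; targets are dec_inputs shifted left by
# one with </s> appended, and both are padded out to dec_timesteps.
def make_decoder_pair(abstract_ids, dec_timesteps, start_id, end_id):
  dec_inputs = [start_id] + abstract_ids   # <s> acts as the <GO> symbol.
  targets = dec_inputs[1:] + [end_id]      # Shift left, close with </s>.
  # Pad both sequences with </s>, matching the queue-filling code above.
  dec_inputs += [end_id] * (dec_timesteps - len(dec_inputs))
  targets += [end_id] * (dec_timesteps - len(targets))
  return dec_inputs, targets

# Example with start_id=1, end_id=2 and abstract ids [5, 6, 7]:
# make_decoder_pair([5, 6, 7], dec_timesteps=6, start_id=1, end_id=2)
# dec_inputs -> [1, 5, 6, 7, 2, 2]
# targets    -> [5, 6, 7, 2, 2, 2]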