Beispiel #1
0
def get_variables(batch, vocab, dec_max_len, use_cuda=True):
    """Convert a (posts, responses) batch into padded, length-sorted tensors.

    Args:
        batch: pair of lists; ``batch[0]`` holds the posts and ``batch[1]``
            the responses, each a batch of variable-length sequences.
        vocab: vocabulary handed to ``sentence2id``.
        dec_max_len: length limit passed to ``padding_inputs`` for the
            response and reference sequences.
        use_cuda: when true, the three padded id tensors are moved with
            ``.cuda()``; the length tensors are returned as produced.

    Returns:
        ``(posts_var, posts_length, responses_var, responses_length,
        references_var, references_length)``, every item reordered by
        descending post length.
    """
    post_ids = [sentence2id(post, vocab) for post in batch[0]]
    # Decoder inputs start with GO; the references are the bare responses.
    response_ids = [[GO_ID] + sentence2id(reply, vocab) for reply in batch[1]]
    reference_ids = [sentence2id(reply, vocab) for reply in batch[1]]

    posts_var, posts_length = padding_inputs(post_ids, None)
    responses_var, responses_length = padding_inputs(response_ids, dec_max_len)
    # eos=True asks padding_inputs to add EOS to the references.
    references_var, references_length = padding_inputs(
        reference_ids, dec_max_len, eos=True)

    # Sort by post length (descending) and apply the same order everywhere.
    posts_length, order = posts_length.sort(0, descending=True)
    posts_var = posts_var[order]
    responses_var, responses_length = responses_var[order], responses_length[order]
    references_var, references_length = references_var[order], references_length[order]

    if use_cuda:
        posts_var, responses_var, references_var = (
            posts_var.cuda(), responses_var.cuda(), references_var.cuda())

    return (posts_var, posts_length, responses_var, responses_length,
            references_var, references_length)
Beispiel #2
0
def get_variables_cls(batch, vocab, dec_max_len, use_cuda=True):
    """Convert a (posts, replies, labels) batch into length-sorted tensors.

    Args:
        batch: indexable with three entries: posts (``batch[0]``), replies
            (``batch[1]``) and labels (``batch[2]``).
        vocab: vocabulary handed to ``sentence2id``.
        dec_max_len: length limit passed to ``padding_inputs`` for replies.
        use_cuda: when true, the post, reply and label tensors are moved
            with ``.cuda()``; the length tensors are returned as produced.

    Returns:
        ``(posts_var, posts_length, reply_var, reply_length, labels)``, every
        item reordered by descending post length.
    """
    post_ids = [sentence2id(post, vocab) for post in batch[0]]
    # Each reply is prefixed with GO.
    reply_ids = [[GO_ID] + sentence2id(reply, vocab) for reply in batch[1]]

    posts_var, posts_length = padding_inputs(post_ids, None)
    reply_var, reply_length = padding_inputs(reply_ids, dec_max_len)
    labels = torch.FloatTensor(batch[2])

    # Sort by post length (descending) and apply the same order everywhere.
    posts_length, order = posts_length.sort(0, descending=True)
    posts_var, reply_var = posts_var[order], reply_var[order]
    reply_length, labels = reply_length[order], labels[order]

    if use_cuda:
        posts_var, reply_var, labels = (
            posts_var.cuda(), reply_var.cuda(), labels.cuda())
    return posts_var, posts_length, reply_var, reply_length, labels
Beispiel #3
0
def chat(line):
    """Answer one encoded input line using the module-level model (forward only).

    ``line`` is decoded and lower-cased, one trailing newline is dropped, and
    both the input and the reply are appended to the global ``output_file``.

    Returns:
        ``'Hmm...'`` for an empty line, ``'TL;DR'`` when the tokenized line is
        longer than ``max_length``, otherwise the constructed response.
    """
    global enc_vocab, inv_dec_vocab, model, saver, sess, output_file

    text = line.decode().lower()

    started_at = time.time()

    if text[-1:] == '\n':
        text = text[:-1]
    if text == '':
        return 'Hmm...'
    output_file.write('HUMAN: ' + text + '\n')

    # Tokenize, rejecting anything longer than max_length.
    ids = data.sentence2id(enc_vocab, str(text))
    if len(ids) > max_length:
        return 'TL;DR'

    # Pick the bucket, build a 1-element batch and run a forward-only step.
    bucket = _find_right_bucket(len(ids))
    enc_in, dec_in, dec_masks = data.get_batch(
        [(ids, [])], bucket, batch_size=1)
    _, _, logits = run_step(sess, model, enc_in, dec_in, dec_masks, bucket,
                            True)
    reply = _construct_response(logits, inv_dec_vocab)
    output_file.write('BOT: ' + reply + '\n')

    # Report the elapsed time for this request.
    print(time.time() - started_at)
    return reply
def chat():
    """Run an interactive console chat that feeds the model a 3-line history.

    In test mode the backward path is not created. Each newline-terminated
    input is appended to a rolling window of the last three lines, and the
    concatenation of that window is what gets tokenized and sent to the
    model. The conversation is appended to ``config.OUTPUT_FILE`` under
    ``config.PROCESSED_PATH``.

    An empty line ends the session. The check is done before the history is
    joined, so pressing Enter exits even when earlier lines are still in the
    window.
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print(
                'Welcome to TensorBro. Say something. Enter to exit. Max length is',
                max_length)
            # Rolling window holding the last 3 lines.
            line_history = ['', '', '']
            while True:
                line = _get_user_input()
                had_newline = len(line) > 0 and line[-1] == '\n'
                if had_newline:
                    line = line[:-1]
                # Test for "Enter to exit" on the raw input, before the
                # history is mixed in.
                if line == '':
                    break
                if had_newline:
                    line_history.append(line)
                    line_history.pop(0)
                    # The model sees the concatenated history window.
                    line = ''.join(line_history)
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue
                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence (forward only).
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write(
                '=============================================\n')
Beispiel #5
0
def chat(use_attention, ckpt_path="./ckp-dir/checkpoints"):
    """Run an interactive console chat with the basic or attention model.

    In test mode the backward path is not created. Input lines are handled
    as bytes. The conversation is appended to ``config.OUTPUT_FILE`` under
    ``config.PROCESSED_PATH``; an empty line ends the session.

    Args:
        use_attention: build ``AttentionChatBotModel`` when true, otherwise
            ``BasicChatBotModel``.
        ckpt_path: checkpoint location forwarded to
            ``_check_restore_parameters``.
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    if not use_attention:
        model = BasicChatBotModel(batch_size=1)
    else:
        model = AttentionChatBotModel(batch_size=1)
    model.build()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver, ckpt_path)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print(
                'Welcome to TensorBro. Say something. Enter to exit. Max length is', max_length)
            while True:
                line = _get_user_input()
                # Indexing bytes yields an int on Python 3, so
                # ``line[-1] == b'\n'`` is never true there; endswith works
                # on both Python 2 and 3.
                if line.endswith(b'\n'):
                    line = line[:-1]
                if line == b'':
                    break
                # NOTE(review): decode('ascii') raises on non-ASCII input.
                output_file.write('HUMAN ++++ ' + line.decode('ascii') + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, line)
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue
                # Bucket lookup is disabled here; the last bucket is always used.
                bucket_id = -1
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Sum the transposed masks along axis 1 to get decoder lengths.
                decoder_lens = np.sum(
                    np.transpose(np.array(decoder_masks), (1, 0)), axis=1)
                # Get output logits for the sentence.
                output_logits = sess.run(
                    [model.final_outputs],
                    feed_dict={model.encoder_inputs_tensor: encoder_inputs,
                               model.decoder_inputs_tensor: decoder_inputs,
                               model.decoder_length_tensor: decoder_lens,
                               model.bucket_length: config.BUCKETS[bucket_id]})
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write(
                '=============================================\n')
Beispiel #6
0
def chat():
    """Run an interactive console chat against the restored model.

    In test mode the backward path is not created. The conversation is
    appended to ``config.OUTPUT_FILE`` under ``config.PROCESSED_PATH``; an
    empty line ends the session.
    """
    # load_vocab yields (index2word, word2index).
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)  # batch size is one (forward only)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            # The longest input the user may type is the largest bucket length.
            max_length = config.BUCKETS[-1][0]
            print(
                'Welcome to TensorBro. Say something. Enter to exit. Max length is',
                max_length)
            while True:
                line = _get_user_input()  # read the user's input
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':  # nothing typed: leave the loop
                    break
                # Record the conversation line by line in the output file.
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # Too long: ask again. The next iteration reads the
                    # retry; reading here as well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue
                # Pick the (smallest) bucket that fits the input length.
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                # The decoder side is given as an empty list (it appears to
                # be fed fully padded).
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence (forward_only == True).
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                # Map the ids back to words to form the response.
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write(
                '=============================================\n')
Beispiel #7
0
def handle_client(client, enc_vocab, inv_dec_vocab, model, saver, sess,
                  output_file):  # Takes client socket as argument.
    """Handle a single client connection until it quits or disconnects.

    The first message received is taken as the client's name. Every later
    message is broadcast, logged to ``output_file``, tokenized and answered
    by the model; the reply is broadcast with a ``"BOT: "`` prefix.

    The session ends when the client sends ``{quit}`` or when ``recv``
    returns no data (peer closed the connection). ``output_file`` is closed
    on exit.

    Args:
        client: connected client socket.
        enc_vocab: encoder vocabulary passed to ``data.sentence2id``.
        inv_dec_vocab: decoder vocabulary passed to ``_construct_response``.
        model: model handed to ``run_step``.
        saver: unused here; kept for interface compatibility.
        sess: session handed to ``run_step``.
        output_file: writable text log.
    """
    name = client.recv(BUFSIZ).decode("utf8")
    max_length = config.BUCKETS[-1][0]
    msg = 'Welcome %s! Max length is %d. If you ever want to quit, type {quit} to exit.' % (
        name, max_length)
    client.send(bytes(msg, "utf8"))
    msg = "%s has joined the chat!" % name
    broadcast(bytes(msg, "utf8"))
    clients[client] = name
    quit_token = bytes("{quit}", "utf8")
    while True:
        msg = client.recv(BUFSIZ)

        # An empty read means the peer closed the socket; without this check
        # the loop would spin forever on empty messages.
        if not msg or msg == quit_token:
            print(u"%s has left the chat." % name)
            client.close()
            del clients[client]

            broadcast(bytes("%s has left the chat." % name, "utf8"))
            break
        broadcast(msg, name + ": ")

        text = msg.decode("utf8")
        output_file.write(u'HUMAN ++++ ' + text + '\n')
        # Get token-ids for the input sentence.
        token_ids = data.sentence2id(enc_vocab, text)

        if len(token_ids) > max_length:
            broadcast(
                bytes("Max length I can handle is %d" % max_length, "utf8"))
            continue
        # Which bucket does it belong to?
        bucket_id = _find_right_bucket(len(token_ids))
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
            [(token_ids, [])], bucket_id, batch_size=1)
        # Get output logits for the sentence (forward only).
        _, _, output_logits = run_step(sess, model, encoder_inputs,
                                       decoder_inputs, decoder_masks,
                                       bucket_id, True)
        response = _construct_response(output_logits, inv_dec_vocab)
        broadcast(bytes(response, "utf8"), "BOT: ")
        output_file.write('BOT ++++ ' + response + '\n')
    output_file.write('=============================================\n')
    # NOTE(review): output_file is passed in by the caller; closing it here
    # affects any other user of the same handle - confirm this is intended.
    output_file.close()
Beispiel #8
0
def predict(demo_sent):
    """Tag ``demo_sent`` with the module-level model and return the tags as JSON.

    The sentence is converted with ``sentence2id`` using ``word2id``, paired
    with an all-zero label list of the same length, and passed to
    ``model.predict_sentence``. Only the first ``len(ids)`` tags are printed
    and returned.
    """
    print('============= 开始预测 =============')
    token_ids = sentence2id(demo_sent, word2id)
    n_tokens = len(token_ids)
    if n_tokens > args.max_len:
        # NOTE(review): this only warns; prediction still runs on the
        # over-long input.
        print('Inputs is too long ')

    print(demo_sent)
    tags = model.predict_sentence(sess, [(token_ids, [0] * n_tokens)])
    clipped_tags = tags[:n_tokens]
    print(clipped_tags)
    return json.dumps(clipped_tags)
Beispiel #9
0
def chat():
    """Run an interactive console chat against the restored model.

    In test mode the backward path is not created. The conversation is
    appended to ``config.OUTPUT_FILE`` under ``config.PROCESSED_PATH``; an
    empty line ends the session.

    :return: None
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab_enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab_dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print(
                'I am ChatBot. Proceed to chat. Enter of exit. Max length is {0}'.
                format(max_length))
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is: {0}'.format(max_length))
                    continue
                # Which bucket does this go in?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence (forward only).
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                # Logits and vocabulary are two separate arguments, not one
                # tuple.
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write(
                '===============================================\n')
Beispiel #10
0
def translate():
    """Run an interactive console translator against the restored model.

    In test mode the backward path is not created. Vocabularies are loaded
    from ``vocab.en`` and ``vocab.vi``. Inputs and translations are appended
    to ``config.OUTPUT_FILE`` under ``config.PROCESSED_PATH``; an empty line
    ends the session.
    """

    model = TranslationModel(True, batch_size=1)
    model.build_graph()

    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.en'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.vi'))

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print('Type something. Enter to exit. Max length is', max_length)
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('English ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue
                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence (forward only).
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('Translation (Vietnamese) ++++ ' + response +
                                  '\n')
            output_file.write(
                '=============================================\n')
Beispiel #11
0
    def generate_answer(self, question):
        """Combines stackoverflow and chitchat parts using intent recognition.

        The question is prepared with ``text_prepare``, vectorized with
        ``self.tfidf_vectorizer`` and classified by ``self.intent_recognizer``.

        Returns:
            For the ``'dialogue'`` intent, the chit-chat model's response, or
            a "Max length" message when the tokenized question is longer than
            the largest bucket. Otherwise ``self.ANSWER_TEMPLATE`` filled with
            the predicted tag and the best thread id.
        """
        # Recognize intent of the question using `intent_recognizer`.
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])
        intent = self.intent_recognizer.predict(features)[0]

        # Chit-chat part:
        if intent == 'dialogue':
            # Pass question to chitchat_bot to generate a response.
            max_length = config.BUCKETS[-1][0]
            line = question
            if len(line) > 0 and line[-1] == '\n':
                line = line[:-1]

            token_ids = data.sentence2id(self.enc_vocab, str(line))
            if len(token_ids) > max_length:
                # Return right away: previously this message was overwritten
                # and the over-long input was still sent to bucketing.
                return "Max length I can handle is:" + str(max_length)

            bucket_id = _find_right_bucket(len(token_ids))

            encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                [(token_ids, [])], bucket_id, batch_size=1)

            # Forward-only step.
            _, _, output_logits = run_step(self.sess, self.model,
                                           encoder_inputs, decoder_inputs,
                                           decoder_masks, bucket_id, True)
            return _construct_response(output_logits, self.inv_dec_vocab)

        # Goal-oriented part:
        # Pass features to tag_classifier to get predictions.
        tag = self.tag_classifier.predict(features)[0]

        # Pass prepared_question to thread_ranker to get predictions.
        thread_id = self.thread_ranker.get_best_thread(prepared_question, tag)

        return self.ANSWER_TEMPLATE % (tag, thread_id)
Beispiel #12
0
def get_predicted_sentence(input_sentence, enc_vocab, inv_dec_vocab, model,
                           sess):
    """Return the model's response to ``input_sentence``.

    Runs a single forward-only step; in test mode the backward path is not
    created.
    """
    ids = data.sentence2id(enc_vocab, input_sentence)
    bucket = _find_right_bucket(len(ids))
    # Build a 1-element batch for the chosen bucket.
    enc_in, dec_in, dec_masks = data.get_batch(
        [(ids, [])], bucket, batch_size=1)
    # Force the first decoder input to id 2.
    # NOTE(review): presumably a start-of-sequence id - confirm against the
    # vocabulary.
    dec_in[0][0] = 2
    # Forward-only step to obtain the output logits.
    _, _, logits = run_step(sess, model, enc_in, dec_in, dec_masks, bucket,
                            True)
    return _construct_response(logits, inv_dec_vocab)
Beispiel #13
0
	def chat(self, line):
		"""Return the model's reply to ``line``, or a canned message if it is too long."""
		# Tokenize the input sentence.
		ids = data.sentence2id(self.__enc_vocab, str(line))

		too_long = len(ids) > self.max_length
		if too_long:
			return "Would you mind to be more concise? I can't understand"

		# Choose the bucket for this input length.
		bucket = _find_right_bucket(len(ids))

		# Build a 1-element batch for that bucket.
		enc_in, dec_in, dec_masks = data.get_batch([(ids, [])], bucket, batch_size=1)

		# Forward-only step, then turn the logits into text.
		_, _, logits = run_step(self.__session, self.__model, enc_in, dec_in, dec_masks, bucket, True)
		reply = _construct_response(logits, self.__inv_dec_vocab)
		return reply
Beispiel #14
0
def predict(test_post_file,
            max_len,
            vocab,
            rev_vocab,
            word_embeddings,
            encoder,
            generator,
            output_file=None):
    """Decode every post in ``test_post_file`` with beam search and print it.

    Posts are read one at a time from ``batcher``. Each is encoded, decoded
    with ``beam_search`` (beam=5, penalty=1.0, nbest=1) and printed to stdout
    together with its response.

    Args:
        test_post_file: path handed to ``batcher``.
        max_len: maximum decode length handed to ``beam_search``.
        vocab: vocabulary handed to ``sentence2id``.
        rev_vocab: reverse vocabulary handed to ``id2sentence``.
        word_embeddings: embedding module applied to the post ids.
        encoder: encoder producing the decoder's initial state.
        generator: decoder used for inference and beam search.
        output_file: optional path; the file is opened in ``'wb'`` mode and
            closed on exit. NOTE(review): nothing is written to it yet.
    """
    # data generator
    test_data_generator = batcher(1, test_post_file, response_file=None)

    fo = open(output_file, 'wb') if output_file else None
    try:
        while True:
            try:
                # Builtin next() works with both Python 2 and 3 iterators.
                post_sentence = next(test_data_generator)
            except StopIteration:
                logger.info(
                    '---------------------finish-------------------------')
                break

            post_ids = [sentence2id(sent, vocab) for sent in post_sentence]
            posts_var, posts_length = padding_inputs(post_ids, None)
            if USE_CUDA:
                posts_var = posts_var.cuda()

            embedded_post = word_embeddings(posts_var)
            _, dec_init_state = encoder(embedded_post,
                                        input_lengths=posts_length.numpy())
            # NOTE(review): this result is not used below; the hypotheses
            # come from beam_search. Call kept to preserve behaviour.
            log_softmax_outputs = generator.inference(
                dec_init_state, word_embeddings)  # [B, T, vocab_size]

            hyps, _ = beam_search(dec_init_state,
                                  max_len,
                                  word_embeddings,
                                  generator,
                                  beam=5,
                                  penalty=1.0,
                                  nbest=1)
            results = [id2sentence(h[0], rev_vocab) for h in hyps]

            # Single-argument print(...) calls run on Python 2 and 3 alike.
            print('*******************************************************')
            print("post:" + ''.join(post_sentence[0]))
            print("response:\n" + '\n'.join(''.join(r) for r in results))
            print('')
    finally:
        # Close the output handle even if decoding raises.
        if fo is not None:
            fo.close()
Beispiel #15
0
def chat(input_cmd):
    """Answer a single input string with a freshly restored model.

    In test mode the backward path is not created. The vocabularies, graph
    and session are built on every call. The input and the response are
    appended to ``config.OUTPUT_FILE`` under ``config.PROCESSED_PATH``.

    Args:
        input_cmd: the user's sentence; one trailing newline is dropped.
            Inputs that tokenize to more than the largest bucket length are
            truncated to that length.

    Returns:
        The constructed response string.
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            max_length = config.BUCKETS[-1][0]

            if len(input_cmd) > 0 and input_cmd[-1] == '\n':
                input_cmd = input_cmd[:-1]

            output_file.write('Input: ' + input_cmd + '\n')
            # Get token-ids for the input sentence.
            token_ids = data.sentence2id(enc_vocab, str(input_cmd))
            if len(token_ids) > max_length:
                # Truncate the tokens that are actually fed to the model;
                # indexing input_cmd only picked one character and left
                # token_ids over-long.
                token_ids = token_ids[:max_length]

            # Which bucket does it belong to?
            bucket_id = _find_right_bucket(len(token_ids))

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                [(token_ids, [])], bucket_id, batch_size=1)

            # Get output logits for the sentence (forward only).
            _, _, output_logits = run_step(sess, model, encoder_inputs,
                                           decoder_inputs, decoder_masks,
                                           bucket_id, True)
            response = _construct_response(output_logits, inv_dec_vocab)
            output_file.write('Response: ' + response + '\n')
        return response
def chat():
    """Run an interactive console chat against the restored model.

    In test mode the backward path is not created. The conversation is
    appended to ``config.OUTPUT_FILE`` under ``config.PROCESSED_PATH``; an
    empty line ends the session.
    """
    _, enc_vocab = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print('Welcome to TensorBro. Say something. Enter to exit. Max length is', max_length)
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue
                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence (forward only).
                _, _, output_logits = run_step(sess, model, encoder_inputs, decoder_inputs,
                                               decoder_masks, bucket_id, True)
                response = _construct_response(output_logits, inv_dec_vocab)
                print(response)
                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write('=============================================\n')
Beispiel #17
0
def chat():
    """Run an interactive console chat against the restored model.

    The conversation is appended to ``config.OUTPUT_FILE`` under
    ``config.PROCESSED_PATH``; an empty line ends the session.
    """
    # The encoder vocabulary must be bound to ``enc_vocab``, the name used
    # when tokenizing below.
    # NOTE(review): 'vocab_enc' vs 'vocab.dec' use different separators -
    # confirm both file names against the data directory.
    _, enc_vocab = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab_enc'))
    inv_dec_vocab, _ = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        log_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)
        # The context manager closes the log even if a step raises.
        with open(log_path, 'a+') as output_file:
            max_length = config.BUCKETS[-1][0]
            print('Welcome to TensorBro, Say something. Enter to exit. Max length is ', max_length)
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break

                output_file.write('HUMAN ++++ ' + line + '\n')

                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    # The next iteration reads the retry; reading here as
                    # well would silently discard it.
                    print('Max length I can handle is:', max_length)
                    continue

                # Pick the bucket, build a 1-element batch, run forward only.
                bucket_id = _find_right_bucket(len(token_ids))
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                response = _construct_response(output_logits, inv_dec_vocab)

                print(response)

                output_file.write('BOT ++++ ' + response + '\n')
            output_file.write('===================================\n')
Beispiel #18
0
def translate():
    """Read sentences from standard input and print the model's output.

    Over-long inputs are truncated to the largest bucket length; an empty
    line ends the session.
    """
    enc_path = os.path.join(config.PROCESSED_PATH, 'vocab.enc')
    _, enc_vocab = data.load_vocab(enc_path)
    dec_path = os.path.join(config.PROCESSED_PATH, 'vocab.dec')
    inv_dec_vocab, _ = data.load_vocab(dec_path)

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)

        # Inputs are capped at the largest bucket's encoder length.
        max_length = config.BUCKETS[-1][0]
        print('Please enter sentence in English')
        while True:
            line = _get_user_input()
            # Drop one trailing newline, if present.
            if line[-1:] == u'\n':
                line = line[:-1]
            if line == '':
                break

            # Tokenize and clip to the maximum supported length.
            ids = data.sentence2id(enc_vocab, line)
            if len(ids) > max_length:
                ids = ids[:max_length]

            # Pick the bucket, build a 1-element batch, run forward only.
            bucket = _find_right_bucket(len(ids))
            enc_in, dec_in, dec_masks = data.get_batch(
                [(ids, [])], bucket, batch_size=1)
            _, _, logits = run_step(sess, model, enc_in, dec_in, dec_masks,
                                    bucket, True)
            print(_construct_response(logits, inv_dec_vocab))
Beispiel #19
0
def chat(sess, input_text, enc_vocab, inv_dec_vocab, model):
    """Build one reply for ``input_text`` with an existing session and model.

    Args:
        sess: Session passed through to ``run_step``.
        input_text: The user's message; converted with ``str()`` before
            tokenization.
        enc_vocab: Encoder vocabulary passed to ``data.sentence2id``.
        inv_dec_vocab: Decoder vocabulary passed to ``_construct_response``.
        model: Model object passed through to ``run_step``.

    Returns:
        The constructed response.  When that response is empty or contains
        "UNK", the input followed by a fixed suffix is returned instead.
    """
    # Evaluated as before; the length check that used it is disabled.
    max_length = config.BUCKETS[-1][0]

    line = input_text

    # Tokenize, then choose the bucket that fits the token count.
    token_ids = data.sentence2id(enc_vocab, str(line))
    bucket_id = _find_right_bucket(len(token_ids))

    # One-element batch for the model.
    batch = data.get_batch([(token_ids, [])], bucket_id, batch_size=1)
    encoder_inputs, decoder_inputs, decoder_masks = batch

    # Only the logits (third of four results) are used here.
    step_result = run_step(sess, model, encoder_inputs, decoder_inputs,
                           decoder_masks, bucket_id, True)
    _, _, output_logits, _ = step_result

    response = _construct_response(output_logits, inv_dec_vocab)
    print(response)

    # Fall back to echoing the input with a suffix when the model had
    # nothing usable to say.
    needs_fallback = len(response) == 0 or "UNK" in response
    if needs_fallback:
        response = line + "เหรอ5555"

    return response
Beispiel #20
0
    def response(self, line):
        """Return a string response to a string input.

        Args:
            line: The user's message.  A single trailing newline is removed
                before tokenization; an empty string is accepted.

        Returns:
            The response built by ``_construct_response`` from the model's
            output logits.

        Raises:
            RuntimeError: If the tokenized input has more than
                ``self.max_length`` ids.
        """
        # Guard the index: an empty input used to raise IndexError here.
        if len(line) > 0 and line[-1] == '\n':
            line = line[:-1]

        # Get token-ids for the input sentence.
        token_ids = data.sentence2id(self.enc_vocab, str(line))
        if len(token_ids) > self.max_length:
            # Format the limit into the message; passing it as a second
            # positional argument made the error render as a tuple.
            raise RuntimeError('Max length the bot can handle is: %s' %
                               (self.max_length,))

        # Which bucket does it belong to?
        bucket_id = _find_right_bucket(len(token_ids))
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
            [(token_ids, [])], bucket_id, batch_size=1)
        # Get output logits for the sentence.
        _, _, output_logits = run_step(self.sess, self.model, encoder_inputs,
                                       decoder_inputs, decoder_masks,
                                       bucket_id, True)
        response = _construct_response(output_logits, self.inv_dec_vocab)
        return response
Beispiel #21
0
def pretrain():
    """Pre-train the embedding, encoder and generator with a supervised loss.

    Workflow, as implemented below:

    1. Parse command-line arguments (training, resume, model and data
       options).
    2. Create ``<exp_dir>/<mode>/<timestamp>`` and configure the logger to
       write ``train.log`` there.
    3. If no vocabulary file was given, call ``build_vocab`` and exit;
       otherwise load the vocabulary.
    4. Build the word embedding, ``EncoderRNN`` and ``Generator`` and one
       Adam optimizer over all of their parameters.
    5. Optionally reload a saved model, then pickle the arguments to
       ``args.pkl`` in the experiment directory.
    6. For every epoch, iterate the batcher; each batch is padded, sorted by
       post length, run through encoder and generator, and optimized with a
       masked NLL loss.  When the batcher is exhausted the model is saved
       and ``eval`` is called.

    Takes no parameters and returns nothing.  Side effects: creates a
    directory, writes log/pickle/model files, and may call ``sys.exit()``.
    """
    # Parse command line arguments
    argparser = argparse.ArgumentParser()

    # train
    # --mode is only used below for the output directory name and in the
    # logged argument dump.
    argparser.add_argument('--mode',
                           '-m',
                           choices=('pretrain', 'adversarial', 'inference'),
                           type=str,
                           required=True)
    argparser.add_argument('--batch_size', '-b', type=int, default=168)
    argparser.add_argument('--num_epoch', '-e', type=int, default=10)
    argparser.add_argument('--print_every', type=int, default=100)
    # NOTE(review): no type/action is given, so a value passed on the command
    # line arrives as a string and any non-empty string (including "False")
    # is truthy in the `if args.use_cuda` checks below.
    argparser.add_argument('--use_cuda', default=True)
    argparser.add_argument('--g_learning_rate',
                           '-glr',
                           type=float,
                           default=0.001)
    # Parsed and pickled with the other args, but not read in this function.
    argparser.add_argument('--d_learning_rate',
                           '-dlr',
                           type=float,
                           default=0.001)

    # resume
    argparser.add_argument('--resume', action='store_true', dest='resume')
    argparser.add_argument('--resume_dir', type=str)
    argparser.add_argument('--resume_epoch', type=int)

    # save
    argparser.add_argument('--exp_dir', type=str, required=True)

    # model
    argparser.add_argument('--emb_dim', type=int, default=128)
    argparser.add_argument('--hidden_dim', type=int, default=256)
    argparser.add_argument('--dropout_rate', '-drop', type=float, default=0.5)
    argparser.add_argument('--n_layers', type=int, default=1)
    argparser.add_argument('--response_max_len', type=int, default=15)

    # data
    argparser.add_argument('--train_query_file',
                           '-tqf',
                           type=str,
                           required=True)
    argparser.add_argument('--train_response_file',
                           '-trf',
                           type=str,
                           required=True)
    argparser.add_argument('--valid_query_file',
                           '-vqf',
                           type=str,
                           required=True)
    argparser.add_argument('--valid_response_file',
                           '-vrf',
                           type=str,
                           required=True)
    argparser.add_argument('--vocab_file', '-vf', type=str, default='')
    argparser.add_argument('--max_vocab_size', '-mv', type=int, default=100000)

    args = argparser.parse_args()

    # set up the output directory
    # The directory name includes a second-resolution timestamp;
    # os.makedirs raises if the directory already exists.
    exp_dirname = os.path.join(args.exp_dir, args.mode,
                               time.strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(exp_dirname)

    # set up the logger
    # `logger` is not defined in this function; it comes from an enclosing
    # scope.
    tqdm_logging.config(logger,
                        os.path.join(exp_dirname, 'train.log'),
                        mode='w',
                        silent=False,
                        debug=True)

    # Without a vocabulary file the run only builds a vocabulary and exits.
    if not args.vocab_file:
        logger.info("no vocabulary file")
        build_vocab(args.train_query_file,
                    args.train_response_file,
                    seperated=True)
        sys.exit()
    else:
        vocab, rev_vocab = load_vocab(args.vocab_file,
                                      max_vocab=args.max_vocab_size)

    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size,
                                   args.emb_dim,
                                   padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size,
                   args.emb_dim,
                   args.hidden_dim,
                   args.n_layers,
                   args.dropout_rate,
                   bidirectional=True,
                   variable_lengths=True)
    # The generator's hidden size is twice the encoder's hidden_dim; the
    # encoder is bidirectional (presumably its two directions are
    # concatenated -- TODO confirm in EncoderRNN).
    G = Generator(vocab_size,
                  args.response_max_len,
                  args.emb_dim,
                  2 * args.hidden_dim,
                  args.n_layers,
                  dropout_p=args.dropout_rate)

    if args.use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()

    # size_average=False makes the loss a sum over elements; it is divided
    # by the batch size explicitly in the training loop.
    loss_func = nn.NLLLoss(size_average=False)
    # A single optimizer updates embedding, encoder and generator together.
    params = list(word_embeddings.parameters()) + list(E.parameters()) + list(
        G.parameters())
    opt = torch.optim.Adam(params, lr=args.g_learning_rate)

    logger.info('----------------------------------')
    logger.info('Pre-train a neural conversation model')
    logger.info('----------------------------------')

    logger.info('Args:')
    logger.info(str(args))

    logger.info('Vocabulary from ' + args.vocab_file)
    logger.info('vocabulary size: %d' % vocab_size)
    logger.info('Loading text data from ' + args.train_query_file + ' and ' +
                args.train_response_file)

    # resume training from other experiment
    if args.resume:
        # NOTE(review): --resume_epoch has no default, so it is None when
        # omitted; how `None >= 0` behaves depends on the Python version.
        assert args.resume_epoch >= 0, 'If resume training, please assign resume_epoch'
        reload_model(args.resume_dir, args.resume_epoch, word_embeddings, E, G)
        start_epoch = args.resume_epoch + 1
    else:
        start_epoch = 0

    # dump args
    with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    for e in range(start_epoch, args.num_epoch):
        logger.info('---------------------training--------------------------')
        # A fresh batcher is created for every epoch.
        train_data_generator = batcher(args.batch_size, args.train_query_file,
                                       args.train_response_file)
        logger.info("Epoch: %d/%d" % (e, args.num_epoch))
        step = 0
        # Running totals since the last progress report.
        total_loss = 0.0
        total_valid_char = []
        cur_time = time.time()
        while True:
            try:
                # NOTE(review): `.next()` is the Python 2 iterator method;
                # confirm the interpreter / batcher type before porting.
                post_sentences, response_sentences = train_data_generator.next(
                )
            except StopIteration:
                # End of epoch.
                # save model
                save_model(exp_dirname, e, word_embeddings, E, G)
                # evaluation
                # `eval` is called with many positional arguments, so it is
                # presumably a project function shadowing the builtin.
                eval(args.valid_query_file, args.valid_response_file,
                     args.batch_size, word_embeddings, E, G, loss_func,
                     args.use_cuda, vocab, args.response_max_len)
                break

            post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
            response_ids = [
                sentence2id(sent, vocab) for sent in response_sentences
            ]
            posts_var, posts_length = padding_inputs(post_ids, None)
            responses_var, responses_length = padding_inputs(
                response_ids, args.response_max_len)
            # sort by post length
            # The same permutation is applied to the responses so that
            # post/response pairs stay aligned.
            posts_length, perms_idx = posts_length.sort(0, descending=True)
            posts_var = posts_var[perms_idx]
            responses_var = responses_var[perms_idx]
            responses_length = responses_length[perms_idx]

            # Append EOS after each sentence: add one extra zero column, then
            # write SYM_EOS at index `length` of every row.
            references_var = torch.cat([
                responses_var,
                Variable(torch.zeros(responses_var.size(0), 1).long(),
                         requires_grad=False)
            ],
                                       dim=1)
            for idx, length in enumerate(responses_length):
                references_var[idx, length] = SYM_EOS

            # show case
            #for p, r, ref in zip(posts_var.data.numpy()[:10], responses_var.data.numpy()[:10], references_var.data.numpy()[:10]):
            #    print ''.join(id2sentence(p, rev_vocab))
            #    print ''.join(id2sentence(r, rev_vocab))
            #    print ''.join(id2sentence(ref, rev_vocab))
            #    print

            if args.use_cuda:
                posts_var = posts_var.cuda()
                responses_var = responses_var.cuda()
                references_var = references_var.cuda()

            embedded_post = word_embeddings(posts_var)
            embedded_response = word_embeddings(responses_var)

            # posts_length is not moved to the GPU above; it is converted to
            # a NumPy array for the encoder.
            _, dec_init_state = E(embedded_post,
                                  input_lengths=posts_length.numpy())
            log_softmax_outputs = G.supervise(
                embedded_response, dec_init_state,
                word_embeddings)  # [B, T, vocab_size]

            # Flatten to one row per target position and multiply by the
            # mask (presumably zero at padded positions -- TODO confirm in
            # `mask`) before computing the summed NLL.
            outputs = log_softmax_outputs.view(-1, vocab_size)
            mask_pos = mask(references_var).view(-1).unsqueeze(-1)
            masked_output = outputs * (mask_pos.expand_as(outputs))
            # Summed loss divided by the batch size.
            loss = loss_func(masked_output,
                             references_var.view(-1)) / (posts_var.size(0))

            opt.zero_grad()
            loss.backward()
            opt.step()

            # Multiply back by the batch size to accumulate the summed loss.
            total_loss += loss * (posts_var.size(0))
            total_valid_char.append(mask_pos)

            # Also true at step 0, so the first report covers one batch.
            if step % args.print_every == 0:
                total_loss_val = total_loss.cpu().data.numpy()[0]
                total_valid_char_val = torch.sum(
                    torch.cat(total_valid_char, dim=1)).cpu().data.numpy()[0]
                # Perplexity = exp(accumulated loss / accumulated mask sum).
                logger.info(
                    'Step %5d: (per word) training perplexity %.2f (%.1f iters/sec)'
                    % (step, math.exp(total_loss_val / total_valid_char_val),
                       args.print_every / (time.time() - cur_time)))
                # Reset the running totals for the next reporting window.
                total_loss = 0.0
                total_valid_char = []
                # Assigned here but not read anywhere in this function.
                total_case_num = 0
                cur_time = time.time()
            step = step + 1
Beispiel #22
0
def chat():
    """Interactive chat loop reading from standard input.

    Loads both vocabularies, builds a ``ChatBotModel`` with batch size 1,
    opens the output log in append mode and then, depending on
    ``config.BEAM_SEARCH`` and ``config.ANTI_LM``, decodes each input line
    with a hand-written beam search combined with an anti-language-model
    score, with the beam path returned by ``run_step``, or (in the visible
    code) only prepares a batch.

    NOTE(review): the visible function never closes ``output_file``, and the
    non-beam branch ends right after building the batch; the snippet may be
    cut off.
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)
        # Opened in append mode; not closed anywhere in the visible code.
        output_file = open(
            os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE), 'a+')
        # Decode from standard input.
        max_length = config.BUCKETS[-1][0]
        print(
            'Welcome to TensorBro. Say something. Enter to exit. Max length is',
            max_length)

        if config.BEAM_SEARCH:
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if (len(token_ids) > max_length):
                    print('Max length I can handle is:', max_length)
                    # NOTE(review): this input is overwritten by the read at
                    # the top of the loop after `continue`, so one line typed
                    # by the user is dropped.
                    line = _get_user_input()
                    continue
                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))

                if config.ANTI_LM:
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = data.get_batch(
                        [(token_ids, [])], bucket_id, batch_size=1)
                    # do beam search and antilm together
                    # Get output logits for the sentence.
                    # Each beam entry is a (prob, tie_breaker, candidate)
                    # tuple; the candidate dict holds the decoder inputs, an
                    # end-of-sentence flag and running scores, all starting
                    # at 1 and combined by multiplication below.
                    beams, new_beams, results = [(1, 0, {
                        'eos': 0,
                        'dec_inp': decoder_inputs,
                        'prob': 1,
                        'prob_ts': 1,
                        'prob_t': 1
                    })], [], [
                    ]  # initialize beams as (log_prob, empty_string, eos)
                    # Encoder input made only of PAD ids, used for the
                    # anti-LM run_step call below.
                    dummy_encoder_inputs = [
                        np.array([config.PAD_ID])
                        for _ in range(len(encoder_inputs))
                    ]

                    # One iteration per decoder position except the last.
                    for dptr in range(len(decoder_inputs) - 1):
                        if dptr > 0:
                            target_weights[dptr] = [1.]
                            # NOTE(review): `args` is not defined in this
                            # function, while the rest of it uses
                            # config.BEAM_SIZE -- confirm a module-level
                            # `args` exists.
                            beams, new_beams = new_beams[:args.beam_size], []
                        if config.DEBUG: print("=====[beams]=====", beams)
                        heapq.heapify(beams)  # since we will remove something
                        for prob, _, cand in beams:
                            # Finished candidates go straight to the results.
                            if cand['eos']:
                                results += [(prob, 0, cand)]
                                continue

                            # normal seq2seq
                            if config.DEBUG:
                                print(
                                    cand['prob'], " ".join([
                                        dict_lookup(inv_dec_vocab, w[0])
                                        for w in cand['dec_inp']
                                    ]))

                            # all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
                            _, _, all_prob_ts = run_step(
                                sess, model, encoder_inputs, cand['dec_inp'],
                                target_weights, bucket_id, True)
                            # Always true here: this code is already inside
                            # the outer `if config.ANTI_LM` branch.
                            if config.ANTI_LM:
                                # anti-lm
                                #   all_prob_t  = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
                                _, _, all_prob_t = run_step(
                                    sess, model, dummy_encoder_inputs,
                                    cand['dec_inp'], target_weights, bucket_id,
                                    True)
                                # adjusted probability
                                # Score with the real input minus LAMBDA
                                # times the score with the PAD-only input.
                                all_prob = all_prob_ts - config.LAMBDA * np.array(
                                    all_prob_t
                                )  #+ args.n_bonus * dptr + random() * 1e-50
                            else:
                                all_prob_t = [0] * len(all_prob_ts)
                                all_prob = all_prob_ts

                            # suppress copy-cat (respond the same as input)
                            # Scale down the score of the input token at the
                            # same position by a factor of 0.01.
                            if dptr < len(token_ids):
                                all_prob[token_ids[dptr]] = all_prob[
                                    token_ids[dptr]] * 0.01

                            # for debug use
                            # if config.DEBUG: return all_prob, all_prob_ts, all_prob_t

                            # beam search
                            # Expand the candidate with the top-scoring
                            # indices of all_prob.
                            for c in np.argsort(
                                    all_prob)[::-1][:args.beam_size]:
                                # Copy the decoder inputs, replacing only
                                # position dptr + 1 with the chosen index.
                                new_cand = {
                                    'eos': (c == config.EOS_ID),
                                    'dec_inp':
                                    [(np.array([c]) if i == (dptr + 1) else k)
                                     for i, k in enumerate(cand['dec_inp'])],
                                    'prob_ts':
                                    cand['prob_ts'] * all_prob_ts[c],
                                    'prob_t':
                                    cand['prob_t'] * all_prob_t[c],
                                    'prob':
                                    cand['prob'] * all_prob[c],
                                }
                                # `random` is called as a function here
                                # (presumably `from random import random`).
                                new_cand = (
                                    new_cand['prob'], random(), new_cand
                                )  # stuff a random to prevent comparing new_cand

                                # Keep at most BEAM_SIZE entries in the
                                # min-heap, replacing the smallest when a
                                # better candidate arrives.
                                try:
                                    if (len(new_beams) < config.BEAM_SIZE):
                                        heapq.heappush(new_beams, new_cand)
                                    elif (new_cand[0] > new_beams[0][0]):
                                        heapq.heapreplace(new_beams, new_cand)
                                except Exception as e:
                                    # Errors are printed and the candidate is
                                    # skipped; decoding continues.
                                    print("[Error]", e)
                                    print("-----[new_beams]-----\n", new_beams)
                                    print("-----[new_cand]-----\n", new_cand)

                    results += new_beams  # flush last cands

                    # post-process results
                    # Highest score first; 'dec_inp' is replaced in place by
                    # the joined string.
                    res_cands = []
                    for prob, _, cand in sorted(results, reverse=True):
                        # NOTE(review): the debug print above looks up w[0],
                        # this passes w itself -- confirm what dict_lookup
                        # expects.
                        cand['dec_inp'] = " ".join([
                            dict_lookup(inv_dec_vocab, w)
                            for w in cand['dec_inp']
                        ])
                        print('response antilm: ', cand['dec_inp'])
                        res_cands.append(cand)
                    # NOTE(review): this returns from chat() after the first
                    # processed line, leaving the input loop without closing
                    # output_file.
                    return res_cands[:args.beam_size]

                else:
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                        [(token_ids, [])], bucket_id, batch_size=1)
                    # add beam search parameter to run_Step
                    path, symbol, output_logits = run_step(
                        sess, model, encoder_inputs, decoder_inputs,
                        decoder_masks, bucket_id, True)

                    # `k` is assigned but not used in the rest of this branch.
                    k = output_logits[0]
                    paths = []
                    for kk in range(config.BEAM_SIZE):
                        paths.append([])
                    # NOTE(review): items of `curr` are assigned below, which
                    # only works where range() returns a list (Python 2).
                    curr = range(config.BEAM_SIZE)
                    num_steps = len(path)
                    # Walk the steps from last to first, following the
                    # entries of `path`; symbols are appended in that
                    # (reverse) order.
                    for i in range(num_steps - 1, -1, -1):
                        for kk in range(config.BEAM_SIZE):
                            paths[kk].append(symbol[i][curr[kk]])
                            curr[kk] = path[i][curr[kk]]
                    # Print and log each distinct response once.
                    responses = set()
                    for kk in range(config.BEAM_SIZE):
                        response = _construct_beam_response(
                            paths[kk], inv_dec_vocab)
                        if response not in responses:
                            responses.add(response)
                            print('response: ', response)
                            output_file.write('BOT ++++ ' + response + '\n')
        else:
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('HUMAN ++++ ' + line + '\n')
                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if (len(token_ids) > max_length):
                    print('Max length I can handle is:', max_length)
                    # NOTE(review): same dropped-input pattern as in the
                    # beam-search branch above.
                    line = _get_user_input()
                    continue
                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                # NOTE(review): the batch is built but no model step follows
                # in the visible code.
                encoder_inputs, decoder_inputs, target_weights = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
Beispiel #23
0
## demo
elif args.mode == 'demo':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       dictname2id,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        while (1):
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace():
                print('See you next time!')
                break
            else:
                demo_sent = list(demo_sent.strip())
                demo_data = [(sentence2id(demo_sent,
                                          word2id), [0] * len(demo_sent))]
                tag = model.demo_one(sess, demo_data)
                res = get_entity(tag[0], demo_sent, dictname2id)
                print(res)
Beispiel #24
0
elif args.mode == 'predict':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=configs)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=configs) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        while 1:
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace():
                print('See you next time!')
                break
            else:
                demo_id = sentence2id(demo_sent, word2id)
                length = len(demo_id)
                if length > args.max_len:
                    print('Inputs is too long ')
                demo_data = [(demo_id, [0] * length)]
                print(demo_id)
                tags = model.predict_sentence(sess, demo_data)
                print(tags[:length])
Beispiel #25
0
def adversarial():
    """Adversarially train the generator against a discriminator.

    Workflow, as implemented below:

    1. Parse the adversarial-specific command-line arguments.
    2. Load the pickled arguments of an earlier run from
       ``<load_path>/args.pkl`` and overlay the new values on them.
    3. Create ``<exp_dir>/adversarial/<timestamp>`` and configure the logger
       to write ``adversarial.log`` there.
    4. Build embedding, encoder, generator and discriminator, reload the
       first three from ``load_path`` / ``load_epoch``, and create separate
       Adam optimizers for ``G.rnn`` and ``D``.
    5. For every epoch, iterate the batcher: the discriminator is updated
       when ``step % training_ratio == 0`` and the generator on every step.
       The model is saved when the batcher is exhausted.

    Takes no parameters and returns nothing.  Raises ``RuntimeError`` when
    the pickled argument file is missing.
    """
    # Use the logger named "lan2720" (not the root logger).
    logger = logging.getLogger("lan2720")
    
    argparser = argparse.ArgumentParser(add_help=False)
    argparser.add_argument('--load_path', '-p', type=str, required=True)
    # TODO: load best
    argparser.add_argument('--load_epoch', '-e', type=int, required=True)
    
    argparser.add_argument('--filter_num', type=int, required=True)
    argparser.add_argument('--filter_sizes', type=str, required=True)

    argparser.add_argument('--training_ratio', type=int, default=2)
    argparser.add_argument('--g_learning_rate', '-glr', type=float, default=0.001)
    argparser.add_argument('--d_learning_rate', '-dlr', type=float, default=0.001)
    argparser.add_argument('--batch_size', '-b', type=int, default=168)
    
    # new arguments used in adversarial
    new_args = argparser.parse_args()
    
    # load default arguments
    # The earlier run's arguments are read from <load_path>/args.pkl and
    # supply everything not overridden below.
    default_arg_file = os.path.join(new_args.load_path, 'args.pkl')
    if not os.path.exists(default_arg_file):
        raise RuntimeError('No default argument file in %s' % new_args.load_path)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point --load_path at trusted directories.
        with open(default_arg_file, 'rb') as f:
            args = pickle.load(f)
    
    # Override selected values from the earlier run.
    args.mode = 'adversarial'
    #args.d_learning_rate  = 0.0001
    # Progress is logged on every step.
    args.print_every = 1
    args.g_learning_rate = new_args.g_learning_rate
    args.d_learning_rate = new_args.d_learning_rate
    args.batch_size = new_args.batch_size

    # add new arguments
    args.load_path = new_args.load_path
    args.load_epoch = new_args.load_epoch
    args.filter_num = new_args.filter_num
    args.filter_sizes = new_args.filter_sizes
    args.training_ratio = new_args.training_ratio
    


    # set up the output directory
    # exp_dir comes from the pickled arguments; os.makedirs raises if the
    # directory already exists.
    exp_dirname = os.path.join(args.exp_dir, args.mode, time.strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(exp_dirname)

    # set up the logger
    tqdm_logging.config(logger, os.path.join(exp_dirname, 'adversarial.log'), 
                        mode='w', silent=False, debug=True)

    # load vocabulary
    vocab, rev_vocab = load_vocab(args.vocab_file, max_vocab=args.max_vocab_size)

    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size, args.emb_dim, padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size, args.emb_dim, args.hidden_dim, args.n_layers, args.dropout_rate, bidirectional=True, variable_lengths=True)
    G = Generator(vocab_size, args.response_max_len, args.emb_dim, 2*args.hidden_dim, args.n_layers, dropout_p=args.dropout_rate)
    # NOTE(review): --filter_sizes is a command-line string evaluated with
    # eval(); if this is the builtin, it runs arbitrary expressions --
    # consider ast.literal_eval.
    D = Discriminator(args.emb_dim, args.filter_num, eval(args.filter_sizes))
    
    if args.use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()
        D.cuda()

    # define optimizer
    # Only G.rnn's parameters and D's parameters are optimized; the embedding
    # and the encoder are not passed to either optimizer.
    opt_G = torch.optim.Adam(G.rnn.parameters(), lr=args.g_learning_rate)
    opt_D = torch.optim.Adam(D.parameters(), lr=args.d_learning_rate)
    
    logger.info('----------------------------------')
    logger.info('Adversarial a neural conversation model')
    logger.info('----------------------------------')

    logger.info('Args:')
    logger.info(str(args))
    
    logger.info('Vocabulary from ' + args.vocab_file)
    logger.info('vocabulary size: %d' % vocab_size)
    logger.info('Loading text data from ' + args.train_query_file + ' and ' + args.train_response_file)
   
    
    # Reload embedding, encoder and generator from the earlier run; D is not
    # passed here.
    reload_model(args.load_path, args.load_epoch, word_embeddings, E, G)
    #    start_epoch = args.resume_epoch + 1
    #else:
    #    start_epoch = 0

    # dump args
    with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)


    # TODO: num_epoch is old one
    for e in range(args.num_epoch):
        # A fresh batcher is created for every epoch.
        train_data_generator = batcher(args.batch_size, args.train_query_file, args.train_response_file)
        logger.info("Epoch: %d/%d" % (e, args.num_epoch))
        step = 0
        cur_time = time.time() 
        while True:
            try:
                # NOTE(review): `.next()` is the Python 2 iterator method;
                # confirm the interpreter / batcher type before porting.
                post_sentences, response_sentences = train_data_generator.next()
            except StopIteration:
                # End of epoch.
                # save model
                save_model(exp_dirname, e, word_embeddings, E, G, D) 
                ## evaluation
                #eval(args.valid_query_file, args.valid_response_file, args.batch_size, 
                #        word_embeddings, E, G, loss_func, args.use_cuda, vocab, args.response_max_len)
                break
            
            # prepare data
            post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
            response_ids = [sentence2id(sent, vocab) for sent in response_sentences]
            posts_var, posts_length = padding_inputs(post_ids, None)
            responses_var, responses_length = padding_inputs(response_ids, args.response_max_len)
            # sort by post length
            # The same permutation is applied to the responses so that
            # post/response pairs stay aligned.
            posts_length, perms_idx = posts_length.sort(0, descending=True)
            posts_var = posts_var[perms_idx]
            responses_var = responses_var[perms_idx]
            responses_length = responses_length[perms_idx]

            if args.use_cuda:
                posts_var = posts_var.cuda()
                responses_var = responses_var.cuda()

            embedded_post = word_embeddings(posts_var)
            # "Real" responses are the embedded ground-truth responses.
            real_responses = word_embeddings(responses_var)

            # forward
            _, dec_init_state = E(embedded_post, input_lengths=posts_length.numpy())
            fake_responses = G(dec_init_state, word_embeddings) # [B, T, emb_size]

            # The discriminator scores each (post, response) pair.
            prob_real = D(embedded_post, real_responses)
            prob_fake = D(embedded_post, fake_responses)
        
            # loss
            # D minimizes -(log D(real) + log(1 - D(fake)));
            # G minimizes log(1 - D(fake)).
            D_loss = - torch.mean(torch.log(prob_real) + torch.log(1. - prob_fake)) 
            G_loss = torch.mean(torch.log(1. - prob_fake))
            
            # The discriminator is updated only on every training_ratio-th
            # step; retain_graph keeps the graph for G's backward pass.
            if step % args.training_ratio == 0:
                opt_D.zero_grad()
                D_loss.backward(retain_graph=True)
                opt_D.step()
            
            # The generator is updated on every step.
            opt_G.zero_grad()
            G_loss.backward()
            opt_G.step()
            
            if step % args.print_every == 0:
                # The value logged as "D accuracy" is the mean of prob_real;
                # "D score" is the negated discriminator loss.
                logger.info('Step %5d: D accuracy=%.2f (0.5 for D to converge) D score=%.2f (-1.38 for G to converge) (%.1f iters/sec)' % (
                    step, 
                    prob_real.cpu().data.numpy().mean(), 
                    -D_loss.cpu().data.numpy()[0], 
                    args.print_every/(time.time()-cur_time)))
                cur_time = time.time()
            step = step + 1
Beispiel #26
0
def test():
    """Decode the test set, score each response with BLEU and log the output.

    Source sentences are read from ``test.enc`` and reference sentences from
    ``test.dec`` (both under ``config.PROCESSED_PATH``). The model is built in
    test mode, so no backward path is created. Each decoded response is
    printed and appended to ``config.OUTPUT_FILE``, followed by a separator
    and the average BLEU score.

    Processing stops at the first empty source line. Source lines with more
    tokens than the largest bucket can hold are not decoded: a ``.`` line is
    written for them and they receive a BLEU score of 0.

    Returns:
        list: one BLEU score per processed source line.

    Raises:
        IndexError: if a decoded source line has no reference line at the
            same index in ``test.dec``.
    """
    _, enc_vocab = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    # Reference sentences in the target language, one per line.
    with io.open(os.path.join(config.PROCESSED_PATH, 'test.dec'),
                 'r',
                 encoding='utf-8') as fh_test_truth:
        test_truths = fh_test_truth.readlines()
    # Source sentences, line-aligned with the references above.
    with io.open(os.path.join(config.PROCESSED_PATH, 'test.enc'),
                 'r',
                 encoding='utf-8') as fh_test_enc:
        test_enc = fh_test_enc.readlines()

    bleu_scores = []

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()
    output_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)

        # Longest input the model can take: encoder size of the last bucket.
        max_length = config.BUCKETS[-1][0]
        smoothing = SmoothingFunction().method1

        # The context manager guarantees the log is flushed and closed even
        # when decoding fails part-way through the test set.
        with io.open(output_path, 'a+', encoding='utf-8') as output_file:
            for i, enc_line in enumerate(test_enc):
                if enc_line.endswith(u'\n'):
                    enc_line = enc_line[:-1]
                if enc_line == '':
                    break

                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, enc_line)
                if len(token_ids) > max_length:
                    print('Max length I can handle is:', max_length)
                    # Keep the output line-aligned with the input file.
                    output_file.write(u'.\n')
                    bleu_scores.append(0)
                    continue

                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                # Get output logits for the sentence.
                _, _, output_logits = run_step(sess, model, encoder_inputs,
                                               decoder_inputs, decoder_masks,
                                               bucket_id, True)
                # response is the translated sentence
                response = _construct_response(output_logits, inv_dec_vocab)

                # sentence_bleu takes a list of tokenised references and one
                # tokenised hypothesis.
                references = [test_truths[i].split()]
                hypothesis = response.split()
                bleu_score = sentence_bleu(references,
                                           hypothesis,
                                           smoothing_function=smoothing)

                print(response)
                output_file.write(response + '\n')
                bleu_scores.append(bleu_score)

            average_bleu = np.mean(np.array(bleu_scores))
            output_file.write(
                u'=============================================\n')
            # Trailing newline: the file is opened in append mode, so without
            # it the next run's first line would be glued onto this one.
            output_file.write(u"Average BLEU: %.5f\n" % average_bleu)

        print("Average BLEU: %.5f" % average_bleu)

        return bleu_scores
Beispiel #27
0
def _backtrack_beam_paths(path, symbol, beam_size):
    """Walk beam back-pointers from the last decoding step to the first.

    Args:
        path: indexable per decoding step; ``path[i][b]`` is used as the beam
            slot to continue from when stepping back from slot ``b`` at
            step ``i``.
        symbol: indexable per decoding step; ``symbol[i][b]`` is the symbol
            collected for slot ``b`` at step ``i``.
        beam_size: number of beams to trace.

    Returns:
        list: ``beam_size`` lists of symbols, each in back-tracking order
        (last decoding step first).
    """
    paths = [[] for _ in range(beam_size)]
    # Must be a real list: the slots are reassigned while walking back, and
    # a Python 3 ``range`` object does not support item assignment.
    curr = list(range(beam_size))
    for i in range(len(path) - 1, -1, -1):
        for kk in range(beam_size):
            paths[kk].append(symbol[i][curr[kk]])
            curr[kk] = path[i][curr[kk]]
    return paths


def chat():
    """Run an interactive chat session on standard input.

    The model is built in test mode, so no backward path is created. Every
    user line and every bot response is appended to ``config.OUTPUT_FILE``
    under ``config.PROCESSED_PATH``. An empty input line ends the session.

    With ``config.BEAM_SEARCH`` set, each distinct beam response is printed;
    otherwise a single response is produced, optionally after the anti-LM
    adjustment controlled by ``config.ANTI_LM``.
    """
    _, enc_vocab = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data.load_vocab(os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()
    output_path = os.path.join(config.PROCESSED_PATH, config.OUTPUT_FILE)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)

        # The context manager closes the transcript even if decoding raises.
        with open(output_path, 'a+') as output_file:
            # Decode from standard input.
            max_length = config.BUCKETS[-1][0]
            print('Welcome to TensorBro. Say something. Enter to exit. Max length is', max_length)

            use_beam_search = config.BEAM_SEARCH
            while True:
                line = _get_user_input()
                if len(line) > 0 and line[-1] == '\n':
                    line = line[:-1]
                if line == '':
                    break
                output_file.write('HUMAN ++++ ' + line + '\n')

                # Get token-ids for the input sentence.
                token_ids = data.sentence2id(enc_vocab, str(line))
                if len(token_ids) > max_length:
                    print('Max length I can handle is:', max_length)
                    # Just loop: the top of the loop reads the next message.
                    # Reading here as well would throw that message away.
                    continue

                # Which bucket does it belong to?
                bucket_id = _find_right_bucket(len(token_ids))
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    [(token_ids, [])], bucket_id, batch_size=1)
                path, symbol, output_logits = run_step(
                    sess, model, encoder_inputs, decoder_inputs,
                    decoder_masks, bucket_id, True)

                if use_beam_search:
                    seen = set()
                    beams = _backtrack_beam_paths(path, symbol, config.BEAM_SIZE)
                    for beam in beams:
                        response = _construct_beam_response(beam, inv_dec_vocab)
                        # Different beams can decode to the same sentence;
                        # report each sentence once.
                        if response in seen:
                            continue
                        seen.add(response)
                        print('response: ', response)
                        output_file.write('BOT ++++ ' + response + '\n')
                    continue

                if config.ANTI_LM:
                    # Second pass with a padding-only encoder input; its
                    # logits are subtracted from the real ones below.
                    dummy_encoder_inputs = [np.array([config.PAD_ID])
                                            for _ in range(len(encoder_inputs))]
                    _, _, output_logits_t = run_step(
                        sess, model, dummy_encoder_inputs, decoder_inputs,
                        decoder_masks, bucket_id, True)
                    # only apply antilm up to a certain point in the decoder input
                    gamma = int(config.GAMMA * len(decoder_inputs))
                    antilm_mask = np.array(
                        [1 * (step < gamma) for step in range(len(decoder_inputs))]
                    ).reshape((-1, 1, 1))
                    output_logits -= config.LAMBDA * (output_logits_t * antilm_mask)

                response = _construct_response(output_logits, inv_dec_vocab)
                print('response: ', response)
                output_file.write('BOT ++++ ' + response + '\n')

            output_file.write('=============================================\n')
Beispiel #28
0
def eval(valid_query_file, valid_response_file, batch_size, word_embeddings, E,
         G, loss_func, use_cuda, vocab, response_max_len):
    """Run one pass over the validation data and log loss and perplexity.

    NOTE: the name shadows the ``eval`` builtin; it is kept unchanged so
    existing callers keep working.

    Args:
        valid_query_file: validation posts, handed to ``batcher``.
        valid_response_file: validation responses, handed to ``batcher``.
        batch_size: batch size handed to ``batcher``.
        word_embeddings: callable mapping id tensors to embeddings.
        E: called on the embedded posts and their lengths; its second return
            value is used as the initial decoder state.
        G: its ``supervise`` method is called with the embedded responses,
            the initial decoder state and ``word_embeddings``.
        loss_func: called with the masked outputs and the flattened
            reference ids.
        use_cuda: when true, the input tensors are moved to the GPU.
        vocab: vocabulary used by ``sentence2id``; its length is used as the
            width of the flattened output.
        response_max_len: length limit handed to ``padding_inputs`` for the
            responses.

    Returns:
        None. Results are written to ``logger``.
    """
    logger.info('---------------------validating--------------------------')
    logger.info('Loading valid data from %s and %s' %
                (valid_query_file, valid_response_file))

    valid_data_generator = batcher(batch_size, valid_query_file,
                                   valid_response_file)

    sum_loss = 0.0
    valid_char_num = 0
    example_num = 0
    while True:
        try:
            # Builtin next() works on both Python 2 and Python 3 iterators;
            # the ``.next()`` method exists on Python 2 only.
            post_sentences, response_sentences = next(valid_data_generator)
        except StopIteration:
            # one epoch finish
            break

        post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
        response_ids = [
            sentence2id(sent, vocab) for sent in response_sentences
        ]
        posts_var, posts_length = padding_inputs(post_ids, None)
        responses_var, responses_length = padding_inputs(
            response_ids, response_max_len)

        # sort by post length
        posts_length, perms_idx = posts_length.sort(0, descending=True)
        posts_var = posts_var[perms_idx]
        responses_var = responses_var[perms_idx]
        responses_length = responses_length[perms_idx]

        # Append EOS after each sentence: add one extra zero column, then
        # write EOS at each response's own length.
        references_var = torch.cat([
            responses_var,
            Variable(torch.zeros(responses_var.size(0), 1).long(),
                     requires_grad=False)
        ],
                                   dim=1)
        for idx, length in enumerate(responses_length):
            references_var[idx, length] = SYM_EOS

        if use_cuda:
            posts_var = posts_var.cuda()
            responses_var = responses_var.cuda()
            references_var = references_var.cuda()

        embedded_post = word_embeddings(posts_var)
        embedded_response = word_embeddings(responses_var)
        _, dec_init_state = E(embedded_post,
                              input_lengths=posts_length.numpy())
        log_softmax_outputs = G.supervise(
            embedded_response, dec_init_state,
            word_embeddings)  # [B, T, vocab_size]

        outputs = log_softmax_outputs.view(-1, len(vocab))
        # Multiply the outputs by the mask of the references before the loss.
        mask_pos = mask(references_var).view(-1).unsqueeze(-1)
        masked_output = outputs * (mask_pos.expand_as(outputs))
        loss = loss_func(masked_output, references_var.view(-1))

        sum_loss += loss.cpu().data.numpy()[0]
        example_num += posts_var.size(0)
        valid_char_num += torch.sum(mask_pos).cpu().data.numpy()[0]

    if example_num == 0:
        # No batch was produced: report it instead of dividing by zero.
        logger.info('No validation examples found, skipping loss report')
    else:
        logger.info(
            'Valid Loss (per case): %.2f Valid Perplexity (per word): %.2f' %
            (sum_loss / example_num, math.exp(sum_loss / valid_char_num)))
    logger.info('---------------------finish-------------------------')