Code Example #1
File: train.py  Project: zcf131016/QA_LSTM_ATTENTION
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100  # Word embedding dimension
    epochs = 10
    batch_size = 64  # Batch data size
    rnn_size = 50  # Number of hidden layer neurons
    sequence_length = 300  # Sentence length
    learning_rate = 0.01  # Learning rate
    lrdownRate = 0.9
    margin = 0.1
    attention_matrix_size = 100
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"
    cpu_device = "/cpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab('D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers('D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, pos_answers, neg_answers = data_helpers.load_train_data('D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.train.token_idx.label', all_answers, voc, word2idx, sequence_length)
    data_size = len(questions)
    permutation = np.random.permutation(data_size)
    questions = questions[permutation, :]
    pos_answers = pos_answers[permutation, :]
    neg_answers = neg_answers[permutation, :]
    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size, rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:  # config=session_conf
            saver = tf.train.Saver()

            print("Start training")
            sess.run(tf.global_variables_initializer())  # Initialize all variables
            for epoch in range(epochs):
                print("The training of the %s iteration is underway" % (epoch + 1))
                batch_number = 1
                for question, pos_answer, neg_answer in data_helpers.batch_iter(questions, pos_answers, neg_answers, batch_size):
                    start_time = time.time()
                    feed_dict = {
                        model.q: question,
                        model.ap: pos_answer,
                        model.an: neg_answer,
                        model.lr: learning_rate
                    }
                    _, loss, acc = sess.run([model.train_op, model.loss, model.acc], feed_dict)
                    duration = time.time() - start_time
                    print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f\tAcc %2.3f' % (epoch + 1, batch_number * batch_size, data_size, duration, loss, acc))
                    batch_number += 1
                learning_rate *= lrdownRate
                saver.save(sess, trained_model)
            print("End of the training")
Code Example #2
File: test.py  Project: zcf131016/QA_LSTM_ATTENTION
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100  # Word embedding dimension
    batch_size = 128  # Batch data size
    sequence_length = 300  # Sentence length
    rnn_size = 50  # Number of hidden layer neurons
    attention_matrix_size = 100
    margin = 0.1
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx',
        voc)
    questions, answers, labels, qids, aids = data_helpers.load_test_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.test1.label.token_idx.pool',
        all_answers, voc, word2idx, sequence_length)
    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default(
        ) as sess:  # config=session_conf
            saver = tf.train.Saver()
            print("Start loading the model")
            saver.restore(sess, trained_model)
            print("The model is loaded")
            scores = []
            for question, answer in data_helpers.test_batch_iter(
                    questions, answers, batch_size):
                feed_dict = {model.qtest: question, model.atest: answer}
                score = sess.run([model.scores], feed_dict)
                scores.extend(score[0].tolist())
            MAP, MRR = eval_map_mrr(qids, aids, scores, labels)
            print('MAP %2.3f\tMRR %2.3f' % (MAP, MRR))
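
eval_map_mrr is defined elsewhere in the project. For reference, the sketch below shows one standard way to compute MAP and MRR from (qid, aid, score, label) tuples, assuming label == 1 marks a relevant answer; it is an illustrative implementation, not the project's own.

from collections import defaultdict

def eval_map_mrr_sketch(qids, aids, scores, labels):
    # Group candidate answers by question; aids are carried along only for completeness.
    per_question = defaultdict(list)
    for qid, aid, score, label in zip(qids, aids, scores, labels):
        per_question[qid].append((score, label))

    ap_sum, rr_sum, n_questions = 0.0, 0.0, 0
    for candidates in per_question.values():
        ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
        hits, precisions, reciprocal_rank = 0, [], 0.0
        for rank, (_, label) in enumerate(ranked, start=1):
            if label == 1:
                hits += 1
                precisions.append(hits / float(rank))
                if reciprocal_rank == 0.0:
                    reciprocal_rank = 1.0 / rank
        if hits > 0:
            ap_sum += sum(precisions) / hits
            rr_sum += reciprocal_rank
            n_questions += 1
    return ap_sum / n_questions, rr_sum / n_questions
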
Code Example #3
# FLAGS._parse_flags()
# FLAGS(sys.argv)

print("\nParameters:")
print(FLAGS)


# Load data
print("Loading data...")
trainset = Dataset('../../data/'+FLAGS.dataset+'/train.ss')
devset = Dataset('../../data/'+FLAGS.dataset+'/dev.ss')
testset = Dataset('../../data/'+FLAGS.dataset+'/test.ss')

alldata = np.concatenate([trainset.t_docs, devset.t_docs, testset.t_docs], axis=0)
embeddingpath = '../../data/'+FLAGS.dataset+'/embedding.txt'
embeddingfile, wordsdict = data_helpers.load_embedding(embeddingpath, alldata, FLAGS.embedding_dim)
del alldata
print("Loading data finished...")

usrdict, prddict = trainset.get_usr_prd_dict()
trainbatches = trainset.batch_iter(usrdict, prddict, wordsdict, FLAGS.n_class, FLAGS.batch_size,
                                 FLAGS.num_epochs, FLAGS.max_sen_len, FLAGS.max_doc_len)
devset.genBatch(usrdict, prddict, wordsdict, FLAGS.batch_size,
                  FLAGS.max_sen_len, FLAGS.max_doc_len, FLAGS.n_class)
testset.genBatch(usrdict, prddict, wordsdict, FLAGS.batch_size,
                  FLAGS.max_sen_len, FLAGS.max_doc_len, FLAGS.n_class)


with tf.Graph().as_default():
    session_config = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
Code Example #4
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)

    train_utters += dev_utters

    context_case = 1
    # TODO: write the code that builds the previous-labels context here!
    # 1) the previous N speech acts (regardless of speaker)
    # 2) all speech acts of the other speaker's utterances in the previous turn (n of them)
    if context_case == 1:

        pass

    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)

    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)

    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(
        train_labels_category)
    test_labels_category = label_binarizer_category.transform(
        test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[
        tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr, tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category, guide_train_labels_attr,
                          guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr, tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)
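
For readers unfamiliar with it, sklearn's MultiLabelBinarizer (used above for the category, attribute, and speech-act labels) maps lists of labels to fixed-width binary indicator rows. The toy labels in this quick demonstration are made up for illustration:

from sklearn import preprocessing

mlb = preprocessing.MultiLabelBinarizer()
mlb.fit([['INI_OPENING', 'QST_WHAT'], ['RES_INFO']])
print(mlb.classes_)                    # ['INI_OPENING' 'QST_WHAT' 'RES_INFO']
print(mlb.transform([['RES_INFO']]))   # [[0 0 1]]
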
Code Example #5
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            train_utters += [
                (transcript, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            test_utters += [
                (translation, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                     vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]

    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[2].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels, guide_test_inputs,
                 guide_test_ctx_inputs, guide_test_labels,
                 guide_test_ctx_labels)

    print("")
Code Example #6
def train():

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # load the vocab and embedding files
            vocab_table, vocab, vocab_size = load_vocab(FLAGS.vocab_file)
            embeddings = load_embedding(FLAGS.embed_file, vocab)
            train_iterator, train_next_batch = get_iterator(
                FLAGS.train_data_file,
                vocab_table,
                FLAGS.batch_size,
                FLAGS.max_seq_len,
                padding=True)
            dev_iterator, dev_next_batch = get_iterator(FLAGS.dev_data_file,
                                                        vocab_table,
                                                        10000000,
                                                        FLAGS.max_seq_len,
                                                        padding=True)

            mode = tf.estimator.ModeKeys.TRAIN
            mymodel = model(vocab_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda,
                            mode=mode)

            global_step = tf.Variable(0, name="global_step", trainable=False)

            learning_rate = 0.001
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            grads_and_vars = optimizer.compute_gradients(mymodel.loss)
            # Clip gradient values to [-1, 1]; skip variables that receive no gradient.
            clipped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                           for grad, var in grads_and_vars
                           if grad is not None]
            train_op = optimizer.apply_gradients(clipped_gvs,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            # timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, tf.flags.FLAGS.model + "_runs"))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", mymodel.loss)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            def train_step():
                """
                A single training step
                """
                [batch] = sess.run([train_next_batch])
                feed_dict = {
                    mymodel.tokens: batch['tokens'],
                    mymodel.surf_features: batch['features'],
                    mymodel.input_y: batch['scores'],
                    mymodel.batchsize: batch['tokens'].shape[0]
                }
                _, step, summaries, loss = sess.run(
                    [train_op, global_step, train_summary_op, mymodel.loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(step, writer=None):
                """
                Evaluates model on a dev set
                """
                sess.run(dev_iterator.initializer)
                while True:
                    try:
                        [batch] = sess.run([dev_next_batch])
                        feed_dict = {
                            mymodel.tokens: batch['tokens'],
                            mymodel.surf_features: batch['features'],
                            mymodel.input_y: batch['scores'],
                            mymodel.batchsize: batch['tokens'].shape[0]
                        }
                        summaries, loss = sess.run(
                            [dev_summary_op, mymodel.loss], feed_dict)
                        print('--- dev loss: ', loss)
                        if writer:
                            writer.add_summary(summaries, step)
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                if writer:
                    writer.add_summary(summaries, step)

            # Initialize all variables
            init_ops = [
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ]
            sess.run(init_ops)
            for epoch in range(FLAGS.num_epochs):
                # initialize going through dataset
                sess.run(train_iterator.initializer)
                while True:
                    try:
                        train_step()
                        current_step = tf.train.global_step(sess, global_step)
                        # evaluate on dev set
                        if current_step % FLAGS.evaluate_every == 0:
                            print("\nEvaluation:")
                            dev_step(current_step, writer=dev_summary_writer)
                            print("")

                        if current_step % FLAGS.checkpoint_every == 0:
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            print(
                                "Saved model checkpoint to {}\n".format(path))
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                print('-' * 100)
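
get_iterator and load_vocab are project helpers whose bodies are not shown. The sketch below shows one way such an iterator is commonly built on tf.data in TF 1.x; the line format, lookup behavior, and padding are assumptions, and the real helper evidently also returns surface features and scores per example, which this sketch omits.

import tensorflow as tf

def get_iterator_sketch(data_file, vocab_table, batch_size, max_seq_len, padding=True):
    dataset = tf.data.TextLineDataset(data_file)
    # Split each line into tokens and map them to ids through the vocab lookup table.
    dataset = dataset.map(
        lambda line: vocab_table.lookup(tf.string_split([line]).values))
    if padding:
        dataset = dataset.map(lambda ids: ids[:max_seq_len])
        dataset = dataset.padded_batch(batch_size, padded_shapes=[max_seq_len])
    else:
        dataset = dataset.batch(batch_size)
    iterator = dataset.make_initializable_iterator()
    return iterator, iterator.get_next()
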
Code Example #7
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            transcript_contexts += [transcript]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'])]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                translation = ''
            transcript_contexts += [translation]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)

    print("")
Code Example #8
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide',  'tourist'], required=True,  help='speaker')

    args = parser.parse_args()
    threshold_predictor = None

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel']=="true"

    # build vocabulary
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)

    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels+sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split and shuffle data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])

    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels

    x_test = test_inputs
    y_test = test_labels

    # construct a pytorch data_loader
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4,
                                         pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4,
                                         pin_memory=False)


    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])

    if torch.cuda.is_available():
        model = model.cuda()
    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()   # set the model to training mode (apply dropout etc)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()

            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()        # set the model to evaluation mode
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
Code Example #9
File: slu_cn_cnn.py  Project: hkhpub/torch-dstc5
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(
        trainset, devset, testset)

    train_utters += dev_utters

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)

    print("")
Code Example #10
logger.addHandler(console)
logger.addHandler(handler)

loss_save = 100.0
patience = 0

#word_index = data_helpers.load_wordindex("./conf/char.tsv")
word_index = data_helpers.load_wordindex("./conf/word.tsv")

sent_end_id = word_index["</s>"]

#train_data = data_helpers.load_data(open(FLAGS.traindata_file, "r").readlines(), word_index)
query_list, candidate_list, labels, test_data = data_helpers.load_data(
    open(FLAGS.testdata_file, "r").readlines(), word_index)

embedding_mat = data_helpers.load_embedding(FLAGS.pretrain_embeddingfile,
                                            FLAGS.embedding_dim)
assert len(word_index) == len(embedding_mat)
embedding_mat = np.array(embedding_mat, dtype=np.float32)

print "embedding_mat.shape"
print embedding_mat.shape

# Training
model_ckpt_path = os.path.join(FLAGS.model_ckpt, "model")
logger.info("logger test")


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.
  Note that this function provides a synchronization point across all towers.
  Args:
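
The snippet breaks off inside the average_gradients docstring. For reference, a helper with this name and docstring usually mirrors the multi-tower averaging from the TensorFlow 1.x tutorials; the body below is that conventional version, not necessarily this project's:

import tensorflow as tf

def average_gradients_sketch(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads
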
Code Example #11
File: train_nn.py  Project: manoochehr1/coref_cnn
    temp = FLAGS.checkpoint
    restore = FLAGS.restore
    FLAGS = pk.load(open(FLAGS.checkpoint + "/FLAGS", "rb"))
    FLAGS.checkpoint = temp
    FLAGS.restore = restore
    starting_epoch = FLAGS.num_epochs
    FLAGS.num_epochs = temp_num_epochs
    vocabulary = pk.load(open(FLAGS.checkpoint + "/vocabulary", 'rb'))
    vocabulary_inv = pk.load(open(FLAGS.checkpoint + "/vocabulary_inv", "rb"))

print("\nparameters:")
for attr, value in sorted(FLAGS.__dict__.items()):
    print("  {} = {}".format(attr, value))

# Load embeddings
pretrained_embedding, FLAGS.embedding_dim = data_helpers.load_embedding(
    vocabulary, FLAGS.pretrained_embedding, FLAGS.embedding_dim)
print("  dim. of word vector by setting", FLAGS.embedding_dim)

# Training
session_conf = tf.ConfigProto(
    allow_soft_placement=FLAGS.allow_soft_placement,
    log_device_placement=FLAGS.log_device_placement,
    gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=FLAGS.gpu_memory))
with tf.Session(config=session_conf) as sess:
    cnn = textNN(
        sequence_length_ment1=FLAGS.sequence_length_ment1,
        sequence_length_ment2=FLAGS.sequence_length_ment2,
        sequence_length_sents_ment1=FLAGS.sequence_length_sents_ment1,
        sequence_length_sents_ment2=FLAGS.sequence_length_sents_ment2,
        sequence_length_add_ment1=FLAGS.sequence_length_add_ment1,