Example #1
    def get_slot(self, sentence):
        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
            self.vocab_slot)

        jieba.load_userdict("../data_resource/doctor_dict.txt")
        jieba.load_userdict("../data_resource/disease_dict.txt")
        jieba.load_userdict("../data_resource/division_dict.txt")
        jieba.load_userdict("../data_resource/week_dict.txt")
        jieba.load_userdict("../data_resource/other_dict.txt")

        model = load_model(model_file)

        _WORD_FILTER = re.compile("([.,!?\"':;)(])")
        sentence = _WORD_FILTER.sub('', sentence)
        if not sentence.isalpha():
            # Bail out early on input that still contains digits, spaces,
            # or other non-word characters after punctuation filtering.
            return "sentence should be words!"
        seg_gen = list(jieba.cut(sentence, cut_all=False))
        _sentence = " ".join(seg_gen)
        # Get token-ids for the input sentence.
        token_ids = data_helper.sentence_to_token_ids(
            tf.compat.as_bytes(_sentence), w2id_sentence)
        # Add GO symbol at the end of sentence
        if data_helper.GO_ID not in token_ids:
            token_ids.append(data_helper.GO_ID)
        pred = model.predict_on_batch(np.array(token_ids)[np.newaxis, :])
        _pred = np.argmax(pred, -1)[0].tolist()
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_helper.EOS_ID in _pred:
            _pred = _pred[:_pred.index(data_helper.EOS_ID)]
        slot_list = [
            tf.compat.as_str(id2w_slot[slot_pred]) for slot_pred in _pred
        ]

        slot_dictionary = {
            'disease': '',
            'division': '',
            'doctor': '',
            'time': ''
        }
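        # Note: the loop below keys on B- (begin) tags only, so a multi-word
        # slot keeps just its first segmented token; I- (inside) continuation
        # tags are not collected.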
        for index, item in enumerate(slot_list):
            if item == 'b-disease':
                slot_dictionary['disease'] = seg_gen[index]
            elif item == 'b-division':
                slot_dictionary['division'] = seg_gen[index]
            elif item == 'b-doctor':
                slot_dictionary['doctor'] = seg_gen[index]
            elif item == 'b-time':
                slot_dictionary['time'] = seg_gen[index]
        return slot_dictionary
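
A minimal sketch of how get_slot might be called, assuming the method lives on a tagger class (named SlotTagger here for illustration) whose constructor records the vocabulary paths; the class name, constructor signature, and query string are assumptions, not part of the example above:

tagger = SlotTagger(vocab_sentence='data/vocab.sentence',  # hypothetical paths
                    vocab_slot='data/vocab.slot')
result = tagger.get_slot(u'我想掛皮膚科的醫生')  # made-up user query
if isinstance(result, dict):
    for key in ('disease', 'division', 'doctor', 'time'):
        print('%s: %s' % (key, result[key] or '(not found)'))
else:
    print(result)  # non-word input yields an error string instead of a dict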
Example #2
def train():
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].items():
        print('%s: %s' % (k, str(v)))
    print("Preparing data in %s" % FLAGS.data_dir)
    (in_seq_train, out_seq_train, label_train,
     in_seq_dev, out_seq_dev, label_dev,
     in_seq_test, out_seq_test, label_test,
     vocab_path, tag_vocab_path, label_vocab_path) = data_helper.prepare_multi_task_data(
         FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)

    result_dir = FLAGS.train_dir + '/test_results'
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    current_tagging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
    current_tagging_test_out_file = result_dir + '/tagging.test.hyp.txt'

    vocab, rev_vocab = data_helper.initialize_vocabulary(vocab_path)
    tag_vocab, rev_tag_vocab = data_helper.initialize_vocabulary(
        tag_vocab_path)
    label_vocab, rev_label_vocab = data_helper.initialize_vocabulary(
        label_vocab_path)

    with tf.Session() as sess:
        # Create model.
        print("Max sequence length: %d." % _buckets[0][0])
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))

        model, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                         len(label_vocab))
        print(
            "Creating model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d."
            % (len(vocab), len(tag_vocab), len(label_vocab)))

        # Read data into buckets and compute their sizes.
        print("Reading train/valid/test data (training set limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
        test_set = read_data(in_seq_test, out_seq_test, label_test)
        train_set = read_data(in_seq_train, out_seq_train, label_train)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]
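        # train_buckets_scale holds, for each bucket, the cumulative fraction
        # of training examples up to and including that bucket; the training
        # loop below compares a uniform random draw against it to sample
        # buckets with probability proportional to their size.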

        # This is the training loop. (It must stay inside the `with
        # tf.Session()` block above, since it uses `sess` on every step.)
        step_time, loss = 0.0, 0.0
        current_step = 0

        best_valid_score = 0
        best_test_score = 0
        while model.global_step.eval() < FLAGS.max_training_steps:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, tags, tag_weights, batch_sequence_length, labels = model.get_batch(
                train_set, bucket_id)
            if task['joint'] == 1:
                _, step_loss, tagging_logits, classification_logits = model.joint_step(
                    sess, encoder_inputs, tags, tag_weights, labels,
                    batch_sequence_length, bucket_id, False)
            elif task['tagging'] == 1:
                _, step_loss, tagging_logits = model.tagging_step(
                    sess, encoder_inputs, tags, tag_weights, batch_sequence_length,
                    bucket_id, False)
            elif task['intent'] == 1:
                _, step_loss, classification_logits = model.classification_step(
                    sess, encoder_inputs, labels, batch_sequence_length, bucket_id,
                    False)

            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d step-time %.2f. Training perplexity %.2f" %
                      (model.global_step.eval(), step_time, perplexity))
                sys.stdout.flush()
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0

                def run_valid_test(data_set, mode):  # mode: Eval, Test
                    # Run evals on development/test set and print the accuracy.
                    word_list = list()
                    ref_tag_list = list()
                    hyp_tag_list = list()
                    ref_label_list = list()
                    hyp_label_list = list()
                    correct_count = 0
                    count = 0  # accumulated across buckets so accuracy covers the full set
                    accuracy = 0.0
                    tagging_eval_result = dict()
                    for bucket_id in xrange(len(_buckets)):
                        eval_loss = 0.0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            encoder_inputs, tags, tag_weights, sequence_length, labels = model_test.get_one(
                                data_set, bucket_id, i)
                            tagging_logits = []
                            classification_logits = []
                            if task['joint'] == 1:
                                _, step_loss, tagging_logits, classification_logits = model_test.joint_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    labels, sequence_length, bucket_id, True)
                            elif task['tagging'] == 1:
                                _, step_loss, tagging_logits = model_test.tagging_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    sequence_length, bucket_id, True)
                            elif task['intent'] == 1:
                                _, step_loss, classification_logits = model_test.classification_step(
                                    sess, encoder_inputs, labels, sequence_length,
                                    bucket_id, True)
                            eval_loss += step_loss / len(data_set[bucket_id])
                            hyp_label = None
                            if task['intent'] == 1:
                                ref_label_list.append(
                                    rev_label_vocab[labels[0][0]])
                                hyp_label = np.argmax(classification_logits[0], 0)
                                hyp_label_list.append(rev_label_vocab[hyp_label])
                                if labels[0][0] == hyp_label:
                                    correct_count += 1
                            if task['tagging'] == 1:
                                word_list.append([
                                    rev_vocab[x[0]]
                                    for x in encoder_inputs[:sequence_length[0]]
                                ])
                                ref_tag_list.append([
                                    rev_tag_vocab[x[0]]
                                    for x in tags[:sequence_length[0]]
                                ])
                                hyp_tag_list.append([
                                    rev_tag_vocab[np.argmax(x)]
                                    for x in tagging_logits[:sequence_length[0]]
                                ])

                    accuracy = float(correct_count) * 100 / count
                    if task['intent'] == 1:
                        print("  %s accuracy: %.2f %d/%d" %
                              (mode, accuracy, correct_count, count))
                        sys.stdout.flush()
                    if task['tagging'] == 1:
                        if mode == 'Eval':
                            tagging_out_file = current_tagging_valid_out_file
                        elif mode == 'Test':
                            tagging_out_file = current_tagging_test_out_file
                        tagging_eval_result = conlleval(hyp_tag_list, ref_tag_list,
                                                        word_list, tagging_out_file)
                        print("  %s f1-score: %.2f" %
                              (mode, tagging_eval_result['f1']))
                        sys.stdout.flush()
                    return accuracy, tagging_eval_result

                # valid
                valid_accuracy, valid_tagging_result = run_valid_test(
                    dev_set, 'Eval')
                if task['tagging'] == 1 and valid_tagging_result[
                        'f1'] > best_valid_score:
                    best_valid_score = valid_tagging_result['f1']
                    # save the best output file
                    subprocess.call([
                        'mv', current_tagging_valid_out_file,
                        current_tagging_valid_out_file +
                        '.best_f1_%.2f' % best_valid_score
                    ])
                # test, run test after each validation for development purpose.
                test_accuracy, test_tagging_result = run_valid_test(
                    test_set, 'Test')
                if task['tagging'] == 1 and test_tagging_result[
                        'f1'] > best_test_score:
                    best_test_score = test_tagging_result['f1']
                    # save the best output file
                    subprocess.call([
                        'mv', current_tagging_test_out_file,
                        current_tagging_test_out_file +
                        '.best_f1_%.2f' % best_test_score
                    ])
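
The bucket selection at the top of the training loop is inverse-CDF sampling over bucket sizes. A self-contained sketch of the same technique, with made-up bucket sizes standing in for len(train_set[b]):

import numpy as np

train_bucket_sizes = [4000, 2500, 1000, 500]  # toy values
train_total_size = float(sum(train_bucket_sizes))

# Cumulative fractions: [0.5, 0.8125, 0.9375, 1.0] for the sizes above.
train_buckets_scale = [
    sum(train_bucket_sizes[:i + 1]) / train_total_size
    for i in range(len(train_bucket_sizes))
]

# Each draw lands in bucket i with probability train_bucket_sizes[i] / total.
random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print(bucket_id)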
Example #3
    def decode(self, sentence=None):
        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
            self.vocab_slot)

        jieba.load_userdict("../data_resource/doctor_dict.txt")
        jieba.load_userdict("../data_resource/disease_dict.txt")
        jieba.load_userdict("../data_resource/division_dict.txt")
        jieba.load_userdict("../data_resource/other_dict.txt")

        model = load_model(model_file)

        if sentence is None:
            # Decode from standard input.
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            while sentence:
                seg_gen = jieba.cut(sentence, cut_all=False)
                _sentence = " ".join(seg_gen)
                # Get token-ids for the input sentence.
                token_ids = data_helper.sentence_to_token_ids(
                    tf.compat.as_bytes(_sentence), w2id_sentence)
                print(token_ids)
                # Add GO symbol at the end of sentence
                if data_helper.GO_ID not in token_ids:
                    token_ids.append(data_helper.GO_ID)
                pred = model.predict_on_batch(
                    np.array(token_ids)[np.newaxis, :])
                _pred = np.argmax(pred, -1)[0].tolist()
                # If there is an EOS symbol in outputs, cut them at that point.
                if data_helper.EOS_ID in _pred:
                    _pred = _pred[:_pred.index(data_helper.EOS_ID)]
                print(" ".join([
                    tf.compat.as_str(id2w_slot[slot_pred])
                    for slot_pred in _pred
                ]))
                print("> ", end="")
                sys.stdout.flush()
                sentence = sys.stdin.readline()

        elif sentence.isalpha():
            seg_gen = jieba.cut(sentence, cut_all=False)
            _sentence = " ".join(seg_gen)
            # Get token-ids for the input sentence.
            token_ids = data_helper.sentence_to_token_ids(
                tf.compat.as_bytes(_sentence), w2id_sentence)
            # Add GO symbol at the end of sentence
            if data_helper.GO_ID not in token_ids:
                token_ids.append(data_helper.GO_ID)
            pred = model.predict_on_batch(np.array(token_ids)[np.newaxis, :])
            _pred = np.argmax(pred, -1)[0].tolist()
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_helper.EOS_ID in _pred:
                _pred = _pred[:_pred.index(data_helper.EOS_ID)]
            return " ".join([
                tf.compat.as_str(id2w_slot[slot_pred]) for slot_pred in _pred
            ])

        else:
            raise ValueError('sentence should be string!')
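
    # decode() runs in two modes: with sentence=None it reads lines from
    # stdin in a prompt loop and prints one predicted slot tag per segmented
    # token; given a string, it returns the space-joined tag sequence.
    # Hedged usage sketch (the tagger instance is assumed):
    #     tags = tagger.decode(u'某句')   # e.g. returns 'o b-division o'
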
    def train(self):
        # Prepare data
        sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence,\
            vocab_slot = data_helper.prepare_data(
                                    "data",
                                    sentence_training_file,
                                    slot_training_file,
                                    sentence_developing_file,
                                    slot_developing_file,
                                    from_vocabulary_size=2000,
                                    to_vocabulary_size=2000,
                                    tokenizer=None)
        sentence_developing, slot_developing = data_helper.read_data(
            sentence_dev, slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            sentence_train, slot_train, max_size=None)

        ## TODO:
        #sentence_training, slot_training = sentence_training[:1000],\
        #    slot_training[:1000]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

        # For conlleval script
        words_train = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_training
        ]
        labels_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_training
        ]
        words_val = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_developing
        ]
        labels_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_developing
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        #model = Sequential()
        #model.add(Embedding(n_vocab,100))
        #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        #model.add(Dropout(0.25))
        #model.add(GRU(100,return_sequences=True))
        #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        #model.compile('rmsprop', 'categorical_crossentropy')

        ## Training
        ##n_epochs = 30
        #n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        #print("Training =>")
        #train_pred_label = []
        #avgLoss = 0

        #for i in range(n_epochs):
        #    print("Training epoch {}".format(i))

        #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
        #    for n_batch, sent in bar(enumerate(sentence_training)):
        #        label = slot_training[n_batch]
        #        # Make labels one hot
        #        label = np.eye(n_classes)[label][np.newaxis, :]
        #        # View each sentence as a batch
        #        sent = sent[np.newaxis, :]

        #        if sent.shape[1] > 1: #ignore 1 word sentences
        #            loss = model.train_on_batch(sent, label)
        #            avgLoss += loss

        #        pred = model.predict_on_batch(sent)
        #        pred = np.argmax(pred, -1)[0]
        #        train_pred_label.append(pred)

        #    avgLoss = avgLoss/n_batch

        #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        #                      for y in train_pred_label]
        #    con_dict = conlleval(predword_train, labels_train,
        #                         words_train, 'measure.txt')
        #    train_f_scores.append(con_dict['f1'])
        #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        #        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
        #    # Save model
        #    model.save(filepath_model)
        #    gc.collect()

        print("Validating =>")
        from keras.models import load_model
        model = load_model(filepath_model)

        labels_pred_val = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_developing[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            if sent.shape[1] > 1:  # skip one-word sentences (works around a Keras issue with length-1 sequences)
                loss = model.test_on_batch(sent, label)
                avgLoss += loss

            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        avgLoss = avgLoss / (n_batch + 1)  # enumerate counts from 0
        gc.collect()

        predword_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val,
                             'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
        print()

    def train(self):
        sentence_developing, slot_developing = data_helper.read_data(
            self.sentence_dev, self.slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            self.sentence_train, self.slot_train, max_size=None)

        # Make toy data; comment this block to train on the full dataset
        #n_toy = 1000
        #sentence_training, slot_training = sentence_training[:n_toy],\
        #    slot_training[:n_toy]
        #sentence_developing, slot_developing = sentence_developing[:round(n_toy/2)],\
        #    slot_developing[:round(n_toy/2)]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
            self.vocab_slot)

        # For conlleval script
        words_train = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_training
        ]
        labels_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_training
        ]
        words_val = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_developing
        ]
        labels_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_developing
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        model = Sequential()
        model.add(Embedding(n_vocab, 100))
        model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        model.add(Dropout(0.25))
        model.add(GRU(100, return_sequences=True))
        model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        model.compile('rmsprop', 'categorical_crossentropy')
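
        # Shape walk-through: the input is a (1, seq_len) array of word ids;
        # Embedding maps it to (1, seq_len, 100); the 'same'-padded
        # Convolution1D and the GRU (return_sequences=True) keep the time
        # axis; TimeDistributed(Dense(n_classes, softmax)) emits a per-token
        # distribution matching the np.eye one-hot targets built below.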

        # Training
        #n_epochs = 30
        n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        print("Training =>")
        train_pred_label = []
        avgLoss = 0

        for i in range(n_epochs):
            print("Training epoch {}".format(i))

            bar = progressbar.ProgressBar(max_value=len(sentence_training))
            for n_batch, sent in bar(enumerate(sentence_training)):
                label = slot_training[n_batch]
                # Make labels one hot
                label = np.eye(n_classes)[label][np.newaxis, :]
                # View each sentence as a batch
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  # ignore one-word sentences
                    loss = model.train_on_batch(sent, label)
                    avgLoss += loss

                pred = model.predict_on_batch(sent)
                pred = np.argmax(pred, -1)[0]
                train_pred_label.append(pred)

            avgLoss = avgLoss / (n_batch + 1)  # enumerate counts from 0

            predword_train = [
                list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                for y in train_pred_label
            ]
            con_dict = conlleval(predword_train, labels_train, words_train,
                                 'measure.txt')
            train_f_scores.append(con_dict['f1'])
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
            # Save model
            model.save(model_file)

            print("Validating =>")

            labels_pred_val = []
            avgLoss = 0

            bar = progressbar.ProgressBar(max_value=len(sentence_developing))
            for n_batch, sent in bar(enumerate(sentence_developing)):
                label = slot_developing[n_batch]
                label = np.eye(n_classes)[label][np.newaxis, :]
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  # skip one-word sentences (works around a Keras issue with length-1 sequences)
                    loss = model.test_on_batch(sent, label)
                    avgLoss += loss

                pred = model.predict_on_batch(sent)
                pred = np.argmax(pred, -1)[0]
                labels_pred_val.append(pred)

            avgLoss = avgLoss / (n_batch + 1)  # enumerate counts from 0

            predword_val = [
                list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                for y in labels_pred_val
            ]
            con_dict = conlleval(predword_val, labels_val, words_val,
                                 'measure.txt')
            val_f_scores.append(con_dict['f1'])
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

            if con_dict['f1'] > best_val_f1:
                best_val_f1 = con_dict['f1']
                with open('model_architecture.json', 'w') as outf:
                    outf.write(model.to_json())
                model.save_weights('best_model_weights.h5', overwrite=True)
                print("Best validation F1 score = {}".format(best_val_f1))
            print()

            # Prevent from tensorflow bugs: BaseSession.__del__
            gc.collect()
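
The best model is persisted as a JSON architecture plus an HDF5 weights file. A minimal sketch of reloading it for inference, assuming the two files written by train() above and Keras's standard model_from_json API:

from keras.models import model_from_json

# Rebuild the network from the saved architecture, then restore the best
# validation-F1 weights; predict_on_batch then works as in the examples above.
with open('model_architecture.json') as f:
    model = model_from_json(f.read())
model.load_weights('best_model_weights.h5')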