Code example #1
File: model.py  Project: et0803/sequence_tagging
    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words. 
                A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
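
Every snippet on this page revolves around a pad_sequences helper, and its exact signature differs between projects (some pad to the longest sequence in the batch, others to a fixed maximum length, and some return only the padded array). As a point of reference, here is a minimal sketch, not taken from any of the projects listed here, that matches the way code examples #1 and #2 call it: sequences are padded with pad_tok up to the longest sequence in the batch, and nlevels=2 additionally pads the inner character-level lists.

def pad_sequences(sequences, pad_tok, nlevels=1):
    """Pad `sequences` with `pad_tok`; return (padded_sequences, lengths)."""
    if nlevels == 1:
        max_length = max(len(seq) for seq in sequences)
        padded = [list(seq) + [pad_tok] * (max_length - len(seq)) for seq in sequences]
        lengths = [len(seq) for seq in sequences]
        return padded, lengths
    # nlevels == 2: pad every word to the longest word, then every sentence to
    # the longest sentence; the returned lengths are per-word character counts
    max_word = max(len(word) for seq in sequences for word in seq)
    max_sent = max(len(seq) for seq in sequences)
    padded, word_lengths = [], []
    for seq in sequences:
        sent = [list(word) + [pad_tok] * (max_word - len(word)) for word in seq]
        lens = [len(word) for word in seq]
        sent += [[pad_tok] * max_word] * (max_sent - len(seq))
        lens += [0] * (max_sent - len(seq))
        padded.append(sent)
        word_lengths.append(lens)
    return padded, word_lengths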
Code example #2
    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        char_ids, word_ids = zip(*words)
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)

        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths,
            self.char_ids: char_ids,
            self.word_lengths: word_lengths
        }

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Code example #3
    def out(self, sentences, out_file=None):
        '''
        :param sentences: two input formats are supported: a txt file, or a list of sentences
        :return: list of prediction results, one per input sentence
        '''
        sentences_list = [[char for char in sen] for sen in sentences]
        sentences_list, sequences_len = pad_sequences(sentences_list,
                                                      self.maxLen)
        sentences_idx = sequences2idx(sentences_list, self.char2idx)
        sequences_len = [
            seq if seq <= self.maxLen else self.maxLen for seq in sequences_len
        ]

        # output (when out_file is a str) is written via self._out_file() below,
        # so no file handle needs to be opened here

        with tf.Session() as sess:
            self.saver.restore(sess, self.model_path + self.model_name)

            labels = [0] * len(sentences_idx)  # dummy labels; not actually used for prediction
            pred_labels = []
            for (bat_sens, _, bat_seqs_len) in batch_yield(sentences_idx,
                                                           labels,
                                                           sequences_len,
                                                           bs=500):
                feed_dict = {
                    self.sentences: bat_sens,
                    self.sequences_len: bat_seqs_len,
                    self.dropout_keep_prob: 1.0
                }
                hidden_scores, transition_params = sess.run(
                    [self.hidden_scores, self.transition_params],
                    feed_dict=feed_dict)
                bat_labels = []
                for score, seq_len in zip(hidden_scores, bat_seqs_len):
                    labs, _ = viterbi_decode(score[:seq_len],
                                             transition_params)
                    bat_labels.append(list(labs))

                pred_labels += [[self.idx2tag[idx] for idx in labs]
                                for labs in bat_labels]

            result = []
            for one_lab, one_sen_str in zip(pred_labels, sentences):
                result.append(self.get_prediction(one_lab, one_sen_str))

            if type(out_file) == str:
                self._out_file(result, out_file)

            return result
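
A hedged illustration of how the out method above might be invoked; the instance name, input sentences and output path are made up for illustration and are not part of the project:

# hypothetical call; `tagger` is an instance of the class defining out()
result = tagger.out(["今天天气不错", "我在北京工作"], out_file="predictions.txt")
print(result)  # list of per-sentence predictions returned by get_prediction()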
Code example #4
File: model.py  Project: Charaisk/BiLSTM-NER
    def train(self, batch_generator, max_steps, save_path, save_every_n,
              log_every_n):
        self.session = tf.Session()
        with self.session as sess:
            sess.run(tf.global_variables_initializer())
            # Train network
            step = 0
            for x, y in batch_generator:
                inputs, sequence_lengths = pad_sequences(x, pad_mark=0)
                targets, _ = pad_sequences(y, pad_mark=0)
                step += 1
                start = time.time()
                feed = {
                    self.inputs: inputs,
                    self.targets: targets,
                    self.sequence_lengths: sequence_lengths,
                    self.keep_prob: self.train_keep_prob
                }
                batch_loss, _ = sess.run([self.loss, self.optimizer],
                                         feed_dict=feed)

                end = time.time()
                # control the print lines
                if step % log_every_n == 0:
                    print('step: {}/{}... '.format(step, max_steps),
                          'loss: {:.4f}... '.format(batch_loss),
                          '{:.4f} sec/batch'.format((end - start)))
                if (step % save_every_n == 0):
                    self.saver.save(sess,
                                    os.path.join(save_path, 'model'),
                                    global_step=step)
                if step >= max_steps:
                    break
            self.saver.save(sess,
                            os.path.join(save_path, 'model'),
                            global_step=step)
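
For context, a hedged sketch of how this train method might be driven; the model instance, generator name and hyperparameter values below are illustrative, not taken from the BiLSTM-NER project:

# hypothetical usage; `train_batches` yields (x, y) batches of id sequences
model.train(train_batches, max_steps=10000, save_path='checkpoints',
            save_every_n=1000, log_every_n=100)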
Code example #5
    def get_feed_dict(self, batch, type='Train', dropout=None):
        # input is ids
        # batch_size = len(batch)  # dynamically calculate batch size

        if type == 'Train':
            input_ids, label_ids, sequence_lengths, input_mask = [], [], [], []
            if self.soft_masked:
                label_bio = []
            for data in batch:
                input_len = len(data['token_ids'])
                input_ids.append(data['token_ids'])
                input_mask.append([1]*input_len)
                label_ids.append(data['labels'])
                sequence_lengths.append(input_len)
                if self.soft_masked:
                    label_bio.append([self.tags_dict[x] for x in data['bio']])

            max_length = min(max(sequence_lengths), self.bert_max_len)
            # print(217,label_ids[0])
            feed = {self.bert_api.input_ids: pad_sequences(input_ids, max_length),
                    self.bert_api.input_mask: pad_sequences(input_mask, max_length),
                    self.labels: pad_sequences(label_ids, max_length),
                    self.bert_api.sequence_lengths: sequence_lengths,
                    self.bert_api.dropout: dropout}
            if self.soft_masked:
                feed[self.label_bio] = pad_sequences(label_bio, max_length)
            return feed, sequence_lengths
        elif type == 'Eval':
            input_ids, label_bio, sequence_lengths, input_mask = [], [], [], []
            for data in batch:
                input_len = len(data['token_ids'])
                # print(254, data['token_ids'], len(data['token_ids']))
                input_ids.append(data['token_ids'])
                input_mask.append([1]*input_len)
                label_bio.append(data['bio'][1:-1])
                sequence_lengths.append(input_len)
            max_length = min(max(sequence_lengths), self.bert_max_len)

            feed = {self.bert_api.input_ids: pad_sequences(input_ids, max_length),
                    self.bert_api.input_mask: pad_sequences(input_mask, max_length),
                    self.bert_api.sequence_lengths: sequence_lengths}
            return feed, input_ids, label_bio, sequence_lengths
        elif type == 'Pred':
            #to be finished
            sequence_lengths = [len(x) for x in batch]
            max_length = min(max(sequence_lengths), self.bert_max_len)
            feed = {self.bert_api.input_ids: pad_sequences(batch, max_length),
                    self.bert_api.sequence_lengths: sequence_lengths
                    }

            return feed, sequence_lengths
Code example #6
    def get_feed_dict(self, words, lr=None, dropout=None, iob=None, mention_type=None, mentions=None, mention_size=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words. 
                A word is a list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
            iob: list of IOB tag ids, one sequence per sentence
            mention_type: list of mention type ids, one sequence per sentence
            mentions: list of mentions; each mention is a list of ids
            mention_size: list of mention sizes
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout
        if iob is not None:
            feed[self.iob_type], _ = pad_sequences(iob, 0)
        mention_num = 0
        if mention_type is not None:
            feed[self.mention_type], _ = pad_sequences(mention_type, 0)

        if mentions is not None:
            feed[self.mention], mention_length = pad_sequences(mentions, pad_tok=0, nlevels=2)
            feed[self.mention_length] = mention_length
            feed[self.mention_size] = mention_size

        return feed, sequence_lengths
Code example #7
    def get_feed_dict(self,
                      words,
                      mor_tags=None,
                      lex_tags=None,
                      labels=None,
                      lr=None,
                      dropout=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words.
                A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
            self.cnn_word_lengths = word_lengths

        if lex_tags is not None:
            lex_tags, _ = pad_sequences(lex_tags, 0)
            # add two hot code here
            batch_arr = []
            for b_i, sentence in enumerate(lex_tags):
                sentence_arr = []
                for w_i, each_word_lex in enumerate(sentence):

                    word_lex_hot = list([0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

                    if isinstance(each_word_lex, str) and ',' in each_word_lex:
                        for word in each_word_lex.split(','):
                            word_idx = int(word)
                            word_lex_hot[word_idx] = 1.0
                    else:
                        word_lex_hot[each_word_lex] = 1.0
                    sentence_arr.append(word_lex_hot)
                batch_arr.append(sentence_arr)
            feed[self.lex_tags] = batch_arr

        if mor_tags is not None:
            mor_tags, _ = pad_sequences(mor_tags, 0)
            feed[self.mor_tags] = mor_tags

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Code example #8
    def get_feed_dict(self,
                      words,
                      pos=None,
                      labels=None,
                      lr=None,
                      dropout=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words. 
                A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """

        # perform padding of the given data
        if self.config.chars:
            char_ids, word_ids = zip(*words)

            # debug pass over the raw vocab words (the result is only printed,
            # not used in the feed dictionary)
            for _wi in word_ids:
                for _ww in _wi:
                    w = self.vocab_words[_ww]
                    w = w.split("/")
                    if len(w) > 2:
                        temp_w = ""
                        for _wd in w[:-1]:
                            temp_w += _wd
                        w = temp_w
                        print(w)
                    '''else:
                        w = w[0]'''
                    #print(self.processing_dics.get(self.vocab_words[_ww]))

            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            pos_ids, _ = pad_sequences(pos, 0)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)
            pos_ids, _ = pad_sequences(pos, 0)

        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }
        feed[self.pos_ids] = pos_ids

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Code example #9
File: ner_model.py  Project: utkrist/simple
    def get_feed_dict(self, words, labels=None, pred_flags=None, lr=None, dropout=None):
        feed, sequence_lengths = super().get_feed_dict(words, labels=labels, lr=lr, dropout=dropout)
        if pred_flags is not None:
            _pred_flags, _ = pad_sequences(pred_flags, 0)
            feed[self.parent.pred_flag] = _pred_flags
        return feed, sequence_lengths
Code example #10
File: model.py  Project: ericxsun/BiDTree
    def get_feed_dict(self,
                      words,
                      poss,
                      chunks,
                      labels=None,
                      btup_idx_list=None,
                      btup_words_list=None,
                      btup_depwords_list=None,
                      btup_deprels_list=None,
                      btup_depwords_length_list=None,
                      upbt_idx_list=None,
                      upbt_words_list=None,
                      upbt_depwords_list=None,
                      upbt_deprels_list=None,
                      upbt_depwords_length_list=None,
                      btup_formidx_list=None,
                      upbt_formidx_list=None,
                      lr=None,
                      dropout=None):
        """
        Given some data, pad it and build a feed dictionary
        """
        # perform padding of the given data
        if self.config.chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, self.nwords,
                                                       self.max_sentence_size,
                                                       self.max_word_size)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   self.nchars,
                                                   self.max_sentence_size,
                                                   self.max_word_size,
                                                   nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, self.nwords,
                                                       self.max_sentence_size,
                                                       self.max_word_size)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }
        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
        if labels is not None:
            labels, _ = pad_sequences(labels, 2, self.max_sentence_size,
                                      self.max_word_size)
            feed[self.labels] = labels
        if lr is not None:
            feed[self.lr] = lr
        if dropout is not None:
            feed[self.dropout] = dropout

        # Begin using deps tree
        if btup_idx_list is not None:
            feed[self.tbatch_size] = len(btup_idx_list)
            btup_idx_list, _ = pad_sequences(btup_idx_list, -1,
                                             self.max_sentence_size)
            feed[self.btup_word_orders] = btup_idx_list
        if btup_words_list is not None:
            btup_words_list, _ = pad_sequences(btup_words_list, self.nwords,
                                               self.max_sentence_size)
            feed[self.btup_word_ids] = btup_words_list
        if btup_depwords_list is not None:
            btup_depwords_list, _ = pad_sequences(btup_depwords_list,
                                                  -1,
                                                  self.max_sentence_size,
                                                  self.max_btup_deps_len,
                                                  nlevels=2)
            feed[self.btup_deps_ids] = btup_depwords_list
        if btup_deprels_list is not None:
            btup_deprels_list, _ = pad_sequences(btup_deprels_list,
                                                 self.nrels,
                                                 self.max_sentence_size,
                                                 self.max_btup_deps_len,
                                                 nlevels=2)
            feed[self.btup_rels_ids] = btup_deprels_list
        if btup_depwords_length_list is not None:
            btup_depwords_length_list, _ = pad_sequences(
                btup_depwords_length_list, 0, self.max_sentence_size)
            feed[self.btup_deps_lens] = btup_depwords_length_list

        if upbt_idx_list is not None:
            upbt_idx_list, _ = pad_sequences(upbt_idx_list, -1,
                                             self.max_sentence_size)
            feed[self.upbt_word_orders] = upbt_idx_list
        if upbt_words_list is not None:
            upbt_words_list, _ = pad_sequences(upbt_words_list, self.nwords,
                                               self.max_sentence_size)
            feed[self.upbt_word_ids] = upbt_words_list
        if upbt_depwords_list is not None:
            upbt_depwords_list, _ = pad_sequences(upbt_depwords_list,
                                                  -1,
                                                  self.max_sentence_size,
                                                  self.max_upbt_deps_len,
                                                  nlevels=2)
            feed[self.upbt_deps_ids] = upbt_depwords_list
        if upbt_deprels_list is not None:
            upbt_deprels_list, _ = pad_sequences(upbt_deprels_list,
                                                 self.nrels,
                                                 self.max_sentence_size,
                                                 self.max_upbt_deps_len,
                                                 nlevels=2)
            feed[self.upbt_rels_ids] = upbt_deprels_list
        if upbt_depwords_length_list is not None:
            upbt_depwords_length_list, _ = pad_sequences(
                upbt_depwords_length_list, 0, self.max_sentence_size)
            feed[self.upbt_deps_lens] = upbt_depwords_length_list

        if btup_formidx_list is not None:
            btup_formidx_list, _ = pad_sequences(btup_formidx_list, -1,
                                                 self.max_sentence_size)
            feed[self.btup_formidxs] = btup_formidx_list
        if upbt_formidx_list is not None:
            upbt_formidx_list, _ = pad_sequences(upbt_formidx_list, -1,
                                                 self.max_sentence_size)
            feed[self.upbt_formidxs] = upbt_formidx_list

        return feed, sequence_lengths
Code example #11
File: main.py  Project: Liguangchuang/base_knowledge
                                          args.idx2tag)
            f1 = evaluate(test_labels, pred_labels)
            print(f1)
            print(test_sentences[10:15])
            print('test:', test_labels[10:15])
            print('pred:', pred_labels[10:15])

    elif out_args.mode == 'demo':
        print('————————————demo————————————')
        model = biLstm_crf_model(args)
        model.load_weights(args.model_path + args.model_name + '.h5')
        while True:
            print('please input a sentence:')
            one_sentence_str = input()
            if one_sentence_str != '':
                one_sentence_list = [[char for char in one_sentence_str]]
                one_sentence_list, seq_len = pad_sequences(
                    one_sentence_list, args.maxLen)
                one_sentence_idx = sequences2idx(one_sentence_list,
                                                 args.char2idx)  # note: this is a 2-D (nested) list

                predictions = model.predict(
                    np.array(one_sentence_idx))  # is something off here? where did the transition matrix go?
                one_label = get_pred_labels(predictions, seq_len,
                                            args.idx2tag)[0]
                print(one_label)

                prediction_list = args.get_prediction(one_label,
                                                      one_sentence_str)
                print(prediction_list, '\n')
Code example #12
# Hyperparameters
dim_word = 300
dim_char = 100
hidden_size_char = 100  # lstm on chars
hidden_size_lstm = 300  # lstm on word embeddings
nepochs = args.epochs
lr = 0.0105
lr_decay = 0.0005
batch_size = 10
dropout = 0.5

# Process training dataset
print('Creating training dataset...')
words, labels = list(minibatches(train, len(train)))[0]  # NOTE: len(train) will return entire dataset!
char_ids, word_ids = zip(*words)
word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=pad_tag)
char_ids, word_lengths = pad_sequences(char_ids, pad_tok=pad_tag, nlevels=2)
labels, _ = pad_sequences(labels, pad_tok=pad_tag)

# Convert word and char ids to np arrays; one-hot encode labels
char_ids_arr = np.array(char_ids)
word_ids_arr = np.array(word_ids)
labels_arr = np.array(labels)
labels_arr_one_hot = np.eye(n_labels)[labels_arr]

# Process validation dataset
print('Creating validation dataset...')
words_valid, labels_valid = list(minibatches(valid, len(valid)))[0]
char_ids_valid, word_ids_valid = zip(*words_valid)
word_ids_valid, sequence_lengths_valid = pad_sequences(word_ids_valid, pad_tok=pad_tag)
char_ids_valid, word_lengths_valid = pad_sequences(char_ids_valid, pad_tok=pad_tag, nlevels=2)
Code example #13
    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words. 
                A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.chars:
            #print (words[0])
            char_ids, pref_ids, suff_ids, pref_ids_2, suff_ids_2, pref_ids_4, suff_ids_4, word_ids = zip(
                *words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            # pad the prefix/suffix feature ids; their sequence lengths match the word sequence lengths
            pref_ids, sequence_lengths_pref = pad_sequences(pref_ids, 0)
            suff_ids, sequence_lengths_suff = pad_sequences(suff_ids, 0)
            pref_ids_2, sequence_lengths_pref_2 = pad_sequences(pref_ids_2, 0)
            suff_ids_2, sequence_lengths_suff_2 = pad_sequences(suff_ids_2, 0)
            pref_ids_4, sequence_lengths_pref_4 = pad_sequences(pref_ids_4, 0)
            suff_ids_4, sequence_lengths_suff_4 = pad_sequences(suff_ids_4, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary; the sequence lengths are identical for the word,
        # prefix and suffix features, so self.sequence_lengths is set only once
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths,
            self.pref_ids: pref_ids,
            self.suff_ids: suff_ids,
            self.pref_ids_2: pref_ids_2,
            self.suff_ids_2: suff_ids_2,
            self.pref_ids_4: pref_ids_4,
            self.suff_ids_4: suff_ids_4
        }

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Code example #14
File: test_data_utils.py  Project: pencoa/Chinese-NER
def test_seq_padding():
    a = np.array([[1, 2, 3, 5], [2, 3, 2], [3, 1, 4, 1, 5, 9]])
    seq, length = pad_sequences(a, 0)
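
Assuming the batch-max padding behaviour sketched after code example #1 (the project's own helper may differ), the call above would yield the following, shown here as a hypothetical companion test:

def test_seq_padding_expected():
    a = [[1, 2, 3, 5], [2, 3, 2], [3, 1, 4, 1, 5, 9]]
    seq, length = pad_sequences(a, 0)
    # every row is padded with 0 up to the longest row (length 6)
    assert seq == [[1, 2, 3, 5, 0, 0], [2, 3, 2, 0, 0, 0], [3, 1, 4, 1, 5, 9]]
    assert length == [4, 3, 6]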
Code example #15
def test():
    ## arguments
    train_path = sys.argv[1]
    test_path = sys.argv[2]
    predict_path = sys.argv[3]
    model_name = sys.argv[4]
    char_embed_path = sys.argv[5]
    word_embed_path = sys.argv[6]
    pos_embed_path = sys.argv[7]
    dict_path = sys.argv[8]

    train_rate = 0.9
    max_char_ctx_len = 1160
    max_word_ctx_len = 680

    char_ctx_len = 1160
    char_qus_len = 240

    word_ctx_len = 400
    word_qus_len = 40

    word_char_len = 5

    char_embed_size = 128
    word_embed_size = 128
    pos_embed_size = 32
    hidden_size = 64
    model_size = 64

    max_epochs = 50
    batch_size = 8

    lr = 0.001
    drop_rate = 0.5
    recur_drop_rate = 0.0
    patience = 20

    ## load data
    print("load data")
    st = time.time()
    train_raw_data = data_utils.load_json_data(train_path)
    test_raw_data = data_utils.load_json_data(test_path)
    #    # load pos data
    #    train_gen_pos_data = data_utils.load_json_data(train_pos_path)
    #    test_gen_pos_data = data_utils.load_json_data(test_pos_path)
    # load embedding
    char_embedding = word2vec.Word2Vec.load(char_embed_path)
    word_embedding = word2vec.Word2Vec.load(word_embed_path)
    pos_embedding = word2vec.Word2Vec.load(pos_embed_path)
    et = time.time()
    print("cost time:", et - st)

    ## process data
    print("process data")
    st = time.time()
    train_data = data_utils.make_train_data(
        train_raw_data
    )  # data format: (id, context, question, answer_start, answer_end)
    test_data = data_utils.make_test_data(
        test_raw_data)  # data format: (id, context, question)
    train_context = [data[1] for data in train_data]
    train_question = [data[2] for data in train_data]
    train_char_answer_start = [data[3] for data in train_data]
    train_char_answer_end = [data[4] for data in train_data]
    #    train_context_poss = [data['context'] for data in train_gen_pos_data['data']]
    #    train_question_poss = [data['question'] for data in train_gen_pos_data['data']]
    test_id = [data[0] for data in test_data]
    test_context = [data[1] for data in test_data]
    test_question = [data[2] for data in test_data]
    #    test_context_poss = [data['context'] for data in test_gen_pos_data['data']]
    #    test_question_poss = [data['question'] for data in test_gen_pos_data['data']]
    del train_data
    del test_data
    et = time.time()
    print("cost time:", et - st)

    ## load vocabulary
    print("load vocabulary")
    st = time.time()
    char_vocab = data_utils.load_json_data('model_%s_char_vocab.json' %
                                           model_name)
    word_vocab = data_utils.load_json_data('model_%s_word_vocab.json' %
                                           model_name)
    pos_vocab = data_utils.load_json_data('model_%s_pos_vocab.json' %
                                          model_name)
    #    poss = train_context_poss + train_question_poss + test_context_poss + test_question_poss
    #    pos_vocab, rev_pos_vocab = data_utils.build_vocabulary_with_embedding(poss, pos_embedding)
    char_vocab_size = len(char_vocab)
    word_vocab_size = len(word_vocab)
    pos_vocab_size = len(pos_vocab)
    et = time.time()
    print("char vocab size:", char_vocab_size)
    print("word vocab size:", word_vocab_size)
    print("pos vocab size:", pos_vocab_size)
    print("cost time:", et - st)

    ## tokenize data
    print("tokenize data")
    st = time.time()
    train_context_chars = data_utils.tokenize_to_chars(train_context)
    train_question_chars = data_utils.tokenize_to_chars(train_question)
    test_context_chars = data_utils.tokenize_to_chars(test_context)
    test_question_chars = data_utils.tokenize_to_chars(test_question)
    train_context_words = data_utils.tokenize_to_words(train_context,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_question_words = data_utils.tokenize_to_words(train_question,
                                                        init_dict=True,
                                                        dict_path=dict_path)
    test_context_words = data_utils.tokenize_to_words(test_context,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_question_words = data_utils.tokenize_to_words(test_question,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_context_poss = data_utils.tokenize_to_poss(train_context,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    train_question_poss = data_utils.tokenize_to_poss(train_question,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_context_poss = data_utils.tokenize_to_poss(test_context,
                                                    init_dict=True,
                                                    dict_path=dict_path)
    test_question_poss = data_utils.tokenize_to_poss(test_question,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    et = time.time()
    print("cost time:", et - st)

    ## select data
    # keep only the samples whose sequence lengths satisfy the length constraints
    print("select data")
    st = time.time()
    select_indices = data_utils.select_data_by_lengths(train_context_words,
                                                       train_question_words,
                                                       word_ctx_len,
                                                       word_qus_len)
    train_context_chars = [train_context_chars[i] for i in select_indices]
    train_context_words = [train_context_words[i] for i in select_indices]
    train_context_poss = [train_context_poss[i] for i in select_indices]
    train_question_chars = [train_question_chars[i] for i in select_indices]
    train_question_words = [train_question_words[i] for i in select_indices]
    train_question_poss = [train_question_poss[i] for i in select_indices]
    train_char_answer_start = [
        train_char_answer_start[i] for i in select_indices
    ]
    train_char_answer_end = [train_char_answer_end[i] for i in select_indices]
    et = time.time()
    print("cost time:", et - st)

    ## set answer
    # this must be done after tokenizing the sentences into words
    print("set answer")
    st = time.time()
    train_word_answer_start, train_word_answer_end = data_utils.set_word_answer(
        train_context_words, train_char_answer_start, train_char_answer_end,
        word_ctx_len)
    train_answer_start, train_answer_end = train_word_answer_start, train_word_answer_end
    et = time.time()
    print("cost time:", et - st)

    ## pad data
    print("pad data")
    st = time.time()
    # clip words to chars
    # this must be done after building the vocab (which adds PAD)
    train_context_clip_chars = data_utils.clip_words_to_chars(
        train_context_words, word_char_len)
    train_question_clip_chars = data_utils.clip_words_to_chars(
        train_question_words, word_char_len)
    test_context_clip_chars = data_utils.clip_words_to_chars(
        test_context_words, word_char_len)
    test_question_clip_chars = data_utils.clip_words_to_chars(
        test_question_words, word_char_len)
    #    print("Debug: tarin_context_clip_chars[0]:")
    #    print(train_context_clip_chars[0])
    #    print("Debug: train_question_clip_chars[0]:")
    #    print(train_question_clip_chars[0])

    # padding
    train_context_pad_chars = data_utils.pad_sequences(
        train_context_clip_chars, word_ctx_len * word_char_len)
    train_question_pad_chars = data_utils.pad_sequences(
        train_question_clip_chars, word_qus_len * word_char_len)
    train_context_pad_words = data_utils.pad_sequences(train_context_words,
                                                       word_ctx_len)
    train_question_pad_words = data_utils.pad_sequences(
        train_question_words, word_qus_len)
    train_context_pad_poss = data_utils.pad_sequences(train_context_poss,
                                                      word_ctx_len)
    train_question_pad_poss = data_utils.pad_sequences(train_question_poss,
                                                       word_qus_len)
    test_context_pad_chars = data_utils.pad_sequences(
        test_context_clip_chars, word_ctx_len * word_char_len)
    test_question_pad_chars = data_utils.pad_sequences(
        test_question_clip_chars, word_qus_len * word_char_len)
    test_context_pad_words = data_utils.pad_sequences(test_context_words,
                                                      word_ctx_len)
    test_question_pad_words = data_utils.pad_sequences(test_question_words,
                                                       word_qus_len)
    test_context_pad_poss = data_utils.pad_sequences(test_context_poss,
                                                     word_ctx_len)
    test_question_pad_poss = data_utils.pad_sequences(test_question_poss,
                                                      word_qus_len)
    et = time.time()
    print("cost time:", et - st)
    ## make arrays
    print("make arrays")
    st = time.time()
    # map vocab to index
    #    print("Debug: train_context_pad_words[0]:")
    #    print(train_context_pad_words[0])
    #    print("Debug: train_question_pad_words[0]:")
    #    print(train_question_pad_words[0])
    train_context_char_indices = data_utils.map_vocabulary_index(
        train_context_pad_chars, char_vocab)
    train_question_char_indices = data_utils.map_vocabulary_index(
        train_question_pad_chars, char_vocab)
    train_context_word_indices = data_utils.map_vocabulary_index(
        train_context_pad_words, word_vocab)
    train_question_word_indices = data_utils.map_vocabulary_index(
        train_question_pad_words, word_vocab)
    train_context_pos_indices = data_utils.map_vocabulary_index(
        train_context_pad_poss, pos_vocab)
    train_question_pos_indices = data_utils.map_vocabulary_index(
        train_question_pad_poss, pos_vocab)
    test_context_char_indices = data_utils.map_vocabulary_index(
        test_context_pad_chars, char_vocab)
    test_question_char_indices = data_utils.map_vocabulary_index(
        test_question_pad_chars, char_vocab)
    test_context_word_indices = data_utils.map_vocabulary_index(
        test_context_pad_words, word_vocab)
    test_question_word_indices = data_utils.map_vocabulary_index(
        test_question_pad_words, word_vocab)
    test_context_pos_indices = data_utils.map_vocabulary_index(
        test_context_pad_poss, pos_vocab)
    test_question_pos_indices = data_utils.map_vocabulary_index(
        test_question_pad_poss, pos_vocab)
    # make one-hot label
    train_answer_start_onehot = data_utils.one_hot_encoding(
        train_answer_start, word_ctx_len)
    train_answer_end_onehot = data_utils.one_hot_encoding(
        train_answer_end, word_ctx_len)
    # to array
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start, Y2: answer_end
    train_X1 = np.array(train_context_char_indices, dtype=np.int32)
    train_X2 = np.array(train_context_word_indices, dtype=np.int32)
    train_X3 = np.array(train_context_pos_indices, dtype=np.int32)
    train_X4 = np.array(train_question_char_indices, dtype=np.int32)
    train_X5 = np.array(train_question_word_indices, dtype=np.int32)
    train_X6 = np.array(train_question_pos_indices, dtype=np.int32)
    train_Y1 = np.array(train_answer_start_onehot, dtype=np.int32)
    train_Y2 = np.array(train_answer_end_onehot, dtype=np.int32)
    train_word_ans1 = np.array(train_answer_start, dtype=np.int32)
    train_word_ans2 = np.array(train_answer_end, dtype=np.int32)
    train_ans1 = np.array(train_char_answer_start, dtype=np.int32)
    train_ans2 = np.array(train_char_answer_end, dtype=np.int32)
    test_X1 = np.array(test_context_char_indices, dtype=np.int32)
    test_X2 = np.array(test_context_word_indices, dtype=np.int32)
    test_X3 = np.array(test_context_pos_indices, dtype=np.int32)
    test_X4 = np.array(test_question_char_indices, dtype=np.int32)
    test_X5 = np.array(test_question_word_indices, dtype=np.int32)
    test_X6 = np.array(test_question_pos_indices, dtype=np.int32)
    # make embedding weight matrix
    word_embed_matrix = data_utils.make_embedding_matrix(
        word_embedding, word_vocab, word_embed_size)
    char_embed_matrix = data_utils.make_embedding_matrix(
        char_embedding, char_vocab, char_embed_size)
    pos_embed_matrix = data_utils.make_embedding_matrix(
        pos_embedding, pos_vocab, pos_embed_size)

    # delete data for releasing memory
    del train_context, train_question, test_context, test_question
    del train_context_chars, train_question_chars, test_context_chars, test_question_chars
    #    del train_context_words, train_question_words, test_context_words, test_question_words
    del train_context_clip_chars, train_question_clip_chars, test_context_clip_chars, test_question_clip_chars
    del train_context_char_indices, train_question_char_indices, test_context_char_indices, test_question_char_indices
    del train_context_word_indices, train_question_word_indices, test_context_word_indices, test_question_word_indices
    del train_context_pos_indices, train_question_pos_indices, test_context_pos_indices, test_question_pos_indices
    del train_word_answer_start, train_word_answer_end, train_char_answer_start, train_char_answer_end
    del train_answer_start_onehot, train_answer_end_onehot
    et = time.time()
    print("train shape:", train_X1.shape, train_X2.shape, train_X3.shape,
          train_X4.shape, train_X5.shape, train_X6.shape, train_Y1.shape,
          train_Y2.shape)
    print("test shape:", test_X1.shape, test_X2.shape, test_X3.shape,
          test_X4.shape, test_X5.shape, test_X6.shape)
    print("cost time:", et - st)

    ## XXX build model
    print("build model")
    st = time.time()
    # input layers
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start; Y2: answer_end
    var_x1_input = Input(shape=(word_ctx_len * word_char_len, ),
                         dtype=np.int32)
    var_x2_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x3_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x4_input = Input(shape=(word_qus_len * word_char_len, ),
                         dtype=np.int32)
    var_x5_input = Input(shape=(word_qus_len, ), dtype=np.int32)
    var_x6_input = Input(shape=(word_qus_len, ), dtype=np.int32)

    # embedding layers
    var_x1_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_ctx_len * word_char_len,
        trainable=False
    )(var_x1_input)  # shape: (None, ctx_length * word_length, char_embed_size)
    var_x2_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x2_input)  # shape: (None, ctx_length, word_embed_size)
    var_x3_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x3_input)  # shape: (None, ctx_length, pos_embed_size)
    var_x4_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_qus_len * word_char_len,
        trainable=False
    )(var_x4_input)  # shape: (None, qus_length * word_length, char_embed_size)
    var_x5_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x5_input)  # shape: (None, qus_length, word_embed_size)
    var_x6_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x6_input)  # shape: (None, qus_length, pos_embed_size)

    var_x1_embed = Reshape([word_ctx_len, word_char_len * char_embed_size])(
        var_x1_embed
    )  # shape: (None, ctx_length, word_length * char_embed_size)
    var_x4_embed = Reshape([word_qus_len, word_char_len * char_embed_size])(
        var_x4_embed
    )  # shape: (None, qus_length, word_length * char_embed_size)
    var_char_embed_layer = Dense(units=word_embed_size)
    var_x1_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_ctx_len, word_char_len * char_embed_size))(
            var_x1_embed)  # shape: (None, ctx_length, word_embed_size)
    var_x1_embed = Activation('relu')(var_x1_embed)
    #    var_x1_embed = Dropout(rate=drop_rate)(var_x1_embed)
    var_x4_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_qus_len, word_char_len * char_embed_size))(
            var_x4_embed)  # shape: (None, qus_length, word_embed_size)
    var_x4_embed = Activation('relu')(var_x4_embed)
    #    var_x4_embed = Dropout(rate=drop_rate)(var_x4_embed)

    #XXX concatenate word embedding and pos embedding directly
    var_ctx_embed = concatenate(
        [var_x1_embed, var_x2_embed, var_x3_embed], axis=2
    )  # shape: (None, ctx_length, word_embed_size * 2 + pos_embed_size)
    var_qus_embed = concatenate(
        [var_x4_embed, var_x5_embed, var_x6_embed], axis=2
    )  # shape: (None, qus_length, word_embed_size * 2 + pos_embed_size)
    var_ctx_embed = Dropout(rate=drop_rate)(var_ctx_embed)
    var_qus_embed = Dropout(rate=drop_rate)(var_qus_embed)

    var_ctx_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_ctx_embed)  # shape: (None, ctx_length, hidden_size * 2)
    var_qus_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_qus_embed)  # shape: (None, qus_length, hidden_size * 2)
    # dropout ?
    #    var_ctx_lstm = Dropout(rate=drop_rate)(var_ctx_lstm)
    #    var_qus_lstm = Dropout(rate=drop_rate)(var_qus_lstm)

    # attention layers
    var_ctx_flatten = Flatten()(
        var_ctx_lstm)  # shape: (None, ctx_length * hidden_size * 2)
    var_qus_flatten = Flatten()(
        var_qus_lstm)  # shape: (None, qus_length * hidden_size * 2)
    var_ctx_repeat = RepeatVector(word_qus_len)(
        var_ctx_flatten
    )  # shape: (None, qus_length, ctx_length * hidden_size * 2)
    var_qus_repeat = RepeatVector(word_ctx_len)(
        var_qus_flatten
    )  # shape: (None, ctx_length, qus_length * hidden_size * 2)
    var_ctx_repeat = Reshape([word_qus_len, word_ctx_len, hidden_size * 2])(
        var_ctx_repeat
    )  # shape: (None, qus_length, ctx_length, hidden_size * 2)
    var_qus_repeat = Reshape([word_ctx_len, word_qus_len, hidden_size * 2])(
        var_qus_repeat
    )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_ctx_repeat = Permute(
        [2, 1, 3])(var_ctx_repeat
                   )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_mul_repeat = multiply([
        var_ctx_repeat, var_qus_repeat
    ])  # shape: (None, ctx_length, qus_length, hidden_size * 2)

    var_sim_repeat = concatenate(
        [var_ctx_repeat, var_qus_repeat, var_mul_repeat],
        axis=3)  # shape: (None, ctx_length, qus_length, hidden_size * 6)
    var_sim_sequence = Reshape([word_ctx_len * word_qus_len, hidden_size * 6])(
        var_sim_repeat
    )  # shape: (None, ctx_length * qus_length, hidden_size * 6)
    # dropout ?
    #    var_sim_sequence = Dropout(rate=drop_rate)(var_sim_sequence)
    var_similarity = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len * word_qus_len, hidden_size * 6))(
            var_sim_sequence)  # shape: (None, ctx_length * qus_length, 1)
    var_similarity = Reshape([word_ctx_len, word_qus_len])(
        var_similarity)  # shape: (None, ctx_length, qus_length)
    var_similarity = Activation('relu')(var_similarity)
    # dropout ?
    #    var_similarity = Dropout(rate=drop_rate)(var_similarity)

    var_c2qatt_weight = TimeDistributed(
        Activation('softmax'), input_shape=(word_ctx_len, word_qus_len))(
            var_similarity)  # shape: (None, ctx_length, qus_length)
    var_c2qatt_ctx = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 1]))(
        [var_c2qatt_weight,
         var_qus_lstm])  # shape: (None, ctx_length, hidden_size * 2)

    var_q2catt_weight = Lambda(lambda x: K.max(x, axis=2))(
        var_similarity)  # shape: (None, ctx_length)
    var_q2catt_weight = RepeatVector(hidden_size * 2)(
        var_q2catt_weight)  # shape: (None, hidden_size * 2, ctx_length)
    var_q2catt_weight = Permute([2, 1])(
        var_q2catt_weight)  # shape: (None, ctx_length, hidden_size * 2)
    var_q2catt_ctx = multiply([var_q2catt_weight, var_ctx_lstm
                               ])  # shape: (None, ctx_length, hidden_size * 2)

    var_c2qctx_attmul = multiply(
        [var_ctx_lstm,
         var_c2qatt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_q2cctx_attmul = multiply(
        [var_ctx_lstm,
         var_q2catt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_attention = concatenate(
        [var_ctx_lstm, var_c2qatt_ctx, var_c2qctx_attmul, var_q2cctx_attmul],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8)
    var_attention = Activation('relu')(var_attention)
    #    # dropout ?
    #    var_attention = Dropout(rate=drop_rate)(var_attention)

    # model layers
    var_model1_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_attention)  # shape: (None, ctx_length, model_size * 2)
    var_model1_att = concatenate(
        [var_attention, var_model1_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model1_att = Dropout(rate=drop_rate)(var_model1_att)

    var_model2_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_model1_lstm)  # shape: (None, ctx_length, model_size * 2)
    var_model2_att = concatenate(
        [var_attention, var_model2_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model2_att = Dropout(rate=drop_rate)(var_model2_att)

    # output layers
    var_pointer1_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model1_att)  # shape: (None, ctx_length, 1)
    var_pointer1_weight = Flatten()(
        var_pointer1_weight)  # shape: (None, ctx_length)
    var_pointer1 = Activation('softmax')(
        var_pointer1_weight)  # shape: (None, ctx_length)

    var_pointer2_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model2_att)  # shape: (None, ctx_length, 1)
    var_pointer2_weight = Flatten()(
        var_pointer2_weight)  # shape: (None, ctx_length)
    var_pointer2 = Activation('softmax')(
        var_pointer2_weight)  # shape: (None, ctx_length)

    model = Model(inputs=[
        var_x1_input, var_x2_input, var_x3_input, var_x4_input, var_x5_input,
        var_x6_input
    ],
                  outputs=[var_pointer1, var_pointer2])

    adam = Adam(lr=lr)

    #    # Set loss functions ?
    #    def two_pointers_crossentropy(y_true, y_pred):
    #        p1_true, p1_pred = y_true[0], y_pred[0]
    #        p2_true, p2_pred = y_true[:,1], y_pred[1]
    #        p1_loss = categorical_crops
    # XXX use multiple loss
    model.compile(
        optimizer=adam,
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
        loss_weights=[0.5, 0.5],
        metrics=['accuracy'])
    et = time.time()
    print("cost time:", et - st)

    ## evaluate
    print("evaluate")
    st = time.time()
    model = load_model('model_%s.h5' % model_name, custom_objects={'tf': tf})
    # compute predict
    print("predict")
    st = time.time()
    train_Y1_hat, train_Y2_hat = model.predict(
        [train_X1, train_X2, train_X3, train_X4, train_X5, train_X6],
        batch_size=batch_size)
    et = time.time()
    print("cost time:", et - st)
    train_Y1_word_pred, train_Y2_word_pred = model_utils.constraint_predict(
        train_Y1_hat, train_Y2_hat)
    train_Y1_pred, train_Y2_pred = data_utils.set_char_answer(
        train_context_words, train_Y1_word_pred, train_Y2_word_pred)
    train_Y1_pred = np.array(train_Y1_pred, dtype=np.int32)
    train_Y2_pred = np.array(train_Y2_pred, dtype=np.int32)
    # evaluate predict with setting answer (word answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred, word_ctx_len)
    print("word-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("word-level train prec rec:", train_prec, train_rec)
    print("word-level train f1:", train_f1)
    # evaluate predict with real answer (char answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred, max_char_ctx_len)
    print("char-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("char-level train prec rec:", train_prec, train_rec)
    print("char-level train f1:", train_f1)
    et = time.time()
    print("cost time:", et - st)

    ## test
    print("test")
    st = time.time()
    test_Y1_hat, test_Y2_hat = model.predict(
        [test_X1, test_X2, test_X3, test_X4, test_X5, test_X6],
        batch_size=batch_size)
    # compute predict
    test_Y1_word_pred, test_Y2_word_pred = model_utils.constraint_predict(
        test_Y1_hat, test_Y2_hat)
    test_Y1_pred, test_Y2_pred = data_utils.set_char_answer(
        test_context_words, test_Y1_word_pred, test_Y2_word_pred)
    test_Y1_pred = np.array(test_Y1_pred, dtype=np.int32)
    test_Y2_pred = np.array(test_Y2_pred, dtype=np.int32)
    data_utils.write_predict(predict_path, test_id, test_Y1_pred, test_Y2_pred)
    et = time.time()
    print("cost time:", et - st)
Code example #16
    def get_feed_dict(self,
                      words,
                      fw_words,
                      bw_words,
                      dict_labels,
                      labels=None,
                      lr=None,
                      dropout=None,
                      test_flag=0):
        """words, fw_words, bw_words, labels, postags,  fw_postags, bw_postags
        Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of words.
                A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.chars and not self.config.posTag and not self.config.dic_flag and not self.config.morphs:
            char_ids, word_ids = zip(*words)
            _, fw_lm_ids = zip(*fw_words)
            _, bw_lm_ids = zip(*bw_words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
            bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
        elif self.config.chars and self.config.posTag:  # also pad the posTag inputs
            if self.config.dic_flag and not self.config.morphs:
                posTag_ids, char_ids, word_ids, dic_ids = zip(*words)
                fw_postag_ids, _, fw_lm_ids, _ = zip(*fw_words)
                bw_postag_ids, _, bw_lm_ids, _ = zip(*bw_words)
                # rebuild dic_ids from the per-sentence dictionary labels
                dic_ids = []
                for dic in dict_labels:
                    tmp_dic1 = []
                    tmp_dic2 = []
                    tmp_dic3 = []
                    tmp_dic4 = []
                    tmp_dic5 = []
                    for d_i, d in enumerate(dic['labels1']):
                        tmp_dic1.append(dic['labels1'][d_i])
                        tmp_dic2.append(dic['labels2'][d_i])
                        tmp_dic3.append(dic['labels3'][d_i])
                        tmp_dic4.append(dic['labels4'][d_i])
                        tmp_dic5.append(dic['labels5'][d_i])

                    dic_ids.append(
                        [tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4, tmp_dic5])
            elif self.config.dic_flag and self.config.morphs:
                posTag_ids, char_ids, word_ids, dic_ids, morph_ids, syl_ids = zip(
                    *words)
                fw_postag_ids, _, fw_lm_ids, _, _, _ = zip(*fw_words)
                bw_postag_ids, _, bw_lm_ids, _, _, _ = zip(*bw_words)

                # rebuild dic_ids from the per-sentence dictionary labels
                dic_ids = []
                for dic in dict_labels:
                    tmp_dic1 = []
                    tmp_dic2 = []
                    tmp_dic3 = []
                    tmp_dic4 = []
                    tmp_dic5 = []
                    for d_i, d in enumerate(dic['labels1']):
                        tmp_dic1.append(dic['labels1'][d_i])
                        tmp_dic2.append(dic['labels2'][d_i])
                        tmp_dic3.append(dic['labels3'][d_i])
                        tmp_dic4.append(dic['labels4'][d_i])
                        tmp_dic5.append(dic['labels5'][d_i])

                    dic_ids.append(
                        [tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4, tmp_dic5])
            else:
                posTag_ids, char_ids, word_ids = zip(*words)
                fw_postag_ids, _, _, fw_lm_ids = zip(*fw_words)
                bw_postag_ids, _, _, bw_lm_ids = zip(*bw_words)

            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
            bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)

            # if self.config.dic_flag:
            #     dic_ids, sequence_lengths = pad_sequences(dic_ids, 0)

            if self.config.dic_flag:
                dic_ids, sequence_lengths = pad_sequences(dic_ids,
                                                          pad_tok=0,
                                                          nlevels=4)
                # if last_flag == False:
                dic_embeddings = np.zeros((len(word_ids), len(word_ids[0]), 6),
                                          dtype=np.float32)
                # elif last_flag == True:
                #     dic_embeddings = np.zeros((3, len(word_ids[0]), 7), dtype=np.float32)
                for batch_i, batch_dict in enumerate(dic_ids):
                    for word_i, word_dict in enumerate(batch_dict[0]):

                        dic_embeddings[batch_i][word_i][int(
                            batch_dict[0][word_i])] = 1
                        dic_embeddings[batch_i][word_i][int(
                            batch_dict[1][word_i])] = 1
                        dic_embeddings[batch_i][word_i][int(
                            batch_dict[2][word_i])] = 1
                        dic_embeddings[batch_i][word_i][int(
                            batch_dict[3][word_i])] = 1
                        dic_embeddings[batch_i][word_i][int(
                            batch_dict[4][word_i])] = 1

                dic_ids = dic_embeddings
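                # Illustrative note (assumption about the data): labels1..labels5
                # each hold one small integer per word, and the loop above turns
                # them into a multi-hot vector of width 6. For example, label
                # values (0, 2, 2, 4, 5) for one word become
                # [1, 0, 1, 0, 1, 1] in dic_embeddings[batch_i][word_i].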
            if self.config.morphs:
                morph_ids, morph_lengths = pad_sequences(morph_ids,
                                                         pad_tok=0,
                                                         nlevels=2)
                syl_ids, syl_lengths = pad_sequences(syl_ids,
                                                     pad_tok=0,
                                                     nlevels=2)
            fw_postag_ids, sequence_lengths = pad_sequences(fw_postag_ids, 0)
            bw_postag_ids, sequence_lengths = pad_sequences(bw_postag_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
            posTag_ids, _ = pad_sequences(posTag_ids, 0)
        else:
            word_ids, morph_ids = zip(*words)
            fw_lm_ids, _ = zip(*fw_words)
            bw_lm_ids, _ = zip(*bw_words)
            morph_ids, morph_lengths = pad_sequences(morph_ids,
                                                     pad_tok=0,
                                                     nlevels=2)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
            bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)
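        # At this point the branches above have produced, depending on the
        # config flags: word_ids / fw_lm_ids / bw_lm_ids (always),
        # char_ids and word_lengths (chars), posTag_ids plus fw/bw_postag_ids
        # (posTag), the multi-hot dic_ids (dic_flag), and
        # morph_ids / syl_ids with their lengths (morphs).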

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.fw_lm_ids: fw_lm_ids,
            self.bw_lm_ids: bw_lm_ids,
            self.sequence_lengths: sequence_lengths
        }
        #if test_flag == 1:
        #    print(word_ids)

        if self.config.posLM:
            feed[self.fw_pos_ids] = fw_postag_ids
            feed[self.bw_pos_ids] = bw_postag_ids

        if self.config.chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if self.config.morphs:
            feed[self.morph_ids] = morph_ids
            feed[self.morph_lengths] = morph_lengths
            feed[self.syl_ids] = syl_ids
            feed[self.syl_lengths] = syl_lengths

        if self.config.dic_flag:
            feed[self.dic_ids] = dic_ids

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout
        # posTag feature
        if self.config.posTag:
            feed[self.posTag_ids] = posTag_ids

        return feed, sequence_lengths
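A minimal usage sketch for the method above. Only get_feed_dict and its argument order come from the listing; the model attributes (train_op, loss), the session, and the batch layout are assumptions.

def run_train_step(sess, model, batch, lr=0.001, dropout=0.5):
    """Hypothetical training step; model.train_op and model.loss are assumed names."""
    words, fw_words, bw_words, dict_labels, labels = batch
    feed, _ = model.get_feed_dict(words, fw_words, bw_words, dict_labels,
                                  labels=labels, lr=lr, dropout=dropout)
    _, batch_loss = sess.run([model.train_op, model.loss], feed_dict=feed)
    return batch_loss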
コード例 #17
def prepare_data(GLOVE_DIR, TEXT_DATA_DIR, MAX_SEQUENCE_LENGTH, MAX_NB_WORDS,
                 EMBEDDING_DIM, VALIDATION_SPLIT, categorical=True):
    """mostly the same preprocessing as in the original post with a couple of differences.
    sklearn's CountVectorizer is used instead of keras tokenizer and the train/test split
    is done using sklearn's train_test_split().
    """
    # build index mapping words in the embeddings set to their embedding
    # vector
    print('Indexing word vectors.')
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

    # second, prepare text samples and their labels
    print('Processing text dataset')
    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                    f.close()
                    labels.append(label_id)

    print('Found %s texts.' % len(texts))

    # vectorize the text samples into a 2D integer tensor. Using sklearn's
    # CountVectorizer instead of the Keras tokenizer (as in the original post)
    # gives a slightly higher overlap between the selected words and the
    # words that have GloVe vectors; ultimately this does not make a major
    # difference.
    processed_texts = [clean_text(t) for t in texts]
    vectorizer = CountVectorizer(max_features=MAX_NB_WORDS)
    vectorizer_fit = vectorizer.fit_transform(processed_texts)

    # Index the words by descending frequency, so the most common word ("the")
    # gets the lowest index. This is irrelevant in practice; it is done only to
    # ease comparison with the dictionary obtained using Keras.
    words  = vectorizer.get_feature_names()
    counts = vectorizer_fit.toarray().sum(axis=0)
    counts_words = list(zip(counts,words))
    counts_words.sort(reverse=True)

    vocabulary = [str(w[1]) for w in counts_words]
    word_index = dict(zip(vocabulary, range(MAX_NB_WORDS)))

    sequences = []
    for doc in processed_texts:
        sequence=[]
        for word in doc.split():
            if word not in word_index:
                continue
            sequence.append(word_index[word])
        sequences.append(sequence)

    data = np.vstack([pad_sequences(s,MAX_SEQUENCE_LENGTH) for s in sequences])
    labels = np.asarray(labels)

    # split the data into a training set and a validation set.
    x_train,x_val,y_train,y_val = train_test_split(
        data, labels, stratify=labels, test_size=VALIDATION_SPLIT)

    if categorical:
        y_train = one_hot(np.asarray(y_train))
        y_val = one_hot(np.asarray(y_val))

    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', y_train.shape)

    print('Preparing embedding matrix.')

    # prepare embedding matrix
    num_words = MAX_NB_WORDS
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return x_train, y_train, x_val, y_val, embedding_matrix
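One typical way to consume the returned embedding_matrix is as frozen weights of a Keras Embedding layer. The sketch below is a hedged example under that assumption; the directory paths, sequence length, word budget, and model layout are placeholders, not values from the original post.

from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

MAX_SEQUENCE_LENGTH = 1000  # placeholder values, not from the original post
x_train, y_train, x_val, y_val, embedding_matrix = prepare_data(
    GLOVE_DIR='glove/', TEXT_DATA_DIR='20_newsgroup/',
    MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH, MAX_NB_WORDS=20000,
    EMBEDDING_DIM=100, VALIDATION_SPLIT=0.2)

model = Sequential([
    Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
              weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
              trainable=False),       # frozen GloVe vectors
    GlobalAveragePooling1D(),         # simple pooled bag-of-embeddings baseline
    Dense(y_train.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2)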
コード例 #18
ファイル: model.py プロジェクト: vutran0230/DOER
    def get_feed_dict(self,
                      words,
                      poss,
                      chunks,
                      labels_aspect=None,
                      labels_polarity=None,
                      labels_joint=None,
                      lr=None,
                      dropout=None,
                      vocab_aspect_tags=None):
        """
        Given some data, pad it and build a feed dictionary
        Args:
                words: list of sentences. A sentence is a list of ids of a list of words.
                        A word is a list of ids
                poss: list of poss_ids
                chunks: list of chunks_ids
                labels_aspect: list of labels_aspect_ids
                lr: (float) learning rate
                dropout: (float) keep prob
        Returns:
                dict {placeholder: value}
        """
        # perform padding of the given data
        word_ids, sequence_lengths = pad_sequences(
            words, self.config.n_words, self.config.max_sentence_size)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if poss is not None:
            poss, _ = pad_sequences(poss, self.config.n_poss,
                                    self.config.max_sentence_size)
            feed[self.pos_ids] = poss

        if chunks is not None:
            if self.config.use_mpqa:
                chunks, _ = pad_sequences(chunks, 0,
                                          self.config.max_sentence_size)
            else:
                chunks, _ = pad_sequences(chunks, self.config.n_chunks,
                                          self.config.max_sentence_size)
            feed[self.chunk_ids] = chunks

        if self.config.use_labels_length:
            if labels_aspect is not None and vocab_aspect_tags is not None:
                labels_average_ = labels_average_length(
                    labels_aspect, vocab_aspect_tags)
                feed[self.labels_aspect_average_length] = labels_average_

        if labels_aspect is not None:
            labels_aspect, _ = pad_sequences(labels_aspect, 0,
                                             self.config.max_sentence_size)
            feed[self.labels_aspect] = labels_aspect

        if labels_polarity is not None:
            labels_polarity, _ = pad_sequences(labels_polarity, 0,
                                               self.config.max_sentence_size)
            feed[self.labels_polarity] = labels_polarity

        if labels_joint is not None:
            labels_joint, _ = pad_sequences(labels_joint, 0,
                                            self.config.max_sentence_size)
            feed[self.labels_joint] = labels_joint

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
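Note the design choice in this variant: word_ids are padded with self.config.n_words (one index past the last real word id) rather than 0, so the pad token gets its own row in the embedding table. The helper below is a hedged stand-in for a pad_sequences that takes an explicit max_length, matching the call pattern above; the project's real implementation may differ.

def pad_sequences(sequences, pad_tok, max_length):
    """Hypothetical pad_sequences: truncate or right-pad every sequence to
    max_length and also return the (capped) original lengths."""
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)[:max_length]
        lengths.append(len(seq))
        padded.append(seq + [pad_tok] * (max_length - len(seq)))
    return padded, lengths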