def get_feed_dict(self, words, labels=None, lr=None, dropout=None): """ Given some data, pad it and build a feed dictionary Args: words: list of sentences. A sentence is a list of ids of a list of words. A word is a list of ids labels: list of ids lr: (float) learning rate dropout: (float) keep prob Returns: dict {placeholder: value} """ # perform padding of the given data if self.config.chars: char_ids, word_ids = zip(*words) word_ids, sequence_lengths = pad_sequences(word_ids, 0) char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2) else: word_ids, sequence_lengths = pad_sequences(words, 0) # build feed dictionary feed = { self.word_ids: word_ids, self.sequence_lengths: sequence_lengths } if self.config.chars: feed[self.char_ids] = char_ids feed[self.word_lengths] = word_lengths if labels is not None: labels, _ = pad_sequences(labels, 0) feed[self.labels] = labels if lr is not None: feed[self.lr] = lr if dropout is not None: feed[self.dropout] = dropout return feed, sequence_lengths
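# Every feed builder in this section leans on a `pad_sequences` helper that is
# never shown. Below is a minimal sketch of the batch-max variant these call
# sites assume (pads to the longest sequence, returns the true lengths;
# nlevels=2 pads char ids inside each word first, then pads sentences). The
# names and exact behavior are inferred from usage, not taken from any one repo.

def _pad(sequences, pad_tok, max_length):
    """Pad/truncate each sequence to max_length; also return true lengths."""
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)
        padded.append(seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0))
        lengths.append(min(len(seq), max_length))
    return padded, lengths


def pad_sequences(sequences, pad_tok, nlevels=1):
    """nlevels=1: pad a batch of sentences to the batch max.
    nlevels=2: pad char ids within each word, then pad the sentences."""
    if nlevels == 1:
        max_length = max(map(len, sequences))
        return _pad(sequences, pad_tok, max_length)
    # nlevels == 2: each sentence is a list of words, each word a list of ids
    max_word = max(max(map(len, sent)) for sent in sequences)
    padded, lengths = [], []
    for sent in sequences:
        sent_padded, word_lens = _pad(sent, pad_tok, max_word)
        padded.append(sent_padded)
        lengths.append(word_lens)
    max_sent = max(map(len, sequences))
    padded, _ = _pad(padded, [pad_tok] * max_word, max_sent)
    lengths, _ = _pad(lengths, 0, max_sent)
    return padded, lengths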
def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
    char_ids, word_ids = zip(*words)
    word_ids, sequence_lengths = pad_sequences(word_ids, 0)
    char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)

    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths,
        self.char_ids: char_ids,
        self.word_lengths: word_lengths
    }

    if labels is not None:
        labels, _ = pad_sequences(labels, 0)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    return feed, sequence_lengths
def out(self, sentences, out_file=None):
    """
    :param sentences: accepts two input formats: a txt file or a list
    :return: list of predictions, one per input sentence
    """
    sentences_list = [[char for char in sen] for sen in sentences]
    sentences_list, sequences_len = pad_sequences(sentences_list, self.maxLen)
    sentences_idx = sequences2idx(sentences_list, self.char2idx)
    sequences_len = [
        seq if seq <= self.maxLen else self.maxLen for seq in sequences_len
    ]

    with tf.Session() as sess:
        self.saver.restore(sess, self.model_path + self.model_name)
        labels = [0] * len(sentences_idx)  # dummy labels, unused at inference
        pred_labels = []
        for (bat_sens, _, bat_seqs_len) in batch_yield(sentences_idx, labels,
                                                       sequences_len, bs=500):
            feed_dict = {
                self.sentences: bat_sens,
                self.sequences_len: bat_seqs_len,
                self.dropout_keep_prob: 1.0
            }
            hidden_scores, transition_params = sess.run(
                [self.hidden_scores, self.transition_params],
                feed_dict=feed_dict)
            bat_labels = []
            for score, seq_len in zip(hidden_scores, bat_seqs_len):
                labs, _ = viterbi_decode(score[:seq_len], transition_params)
                bat_labels.append(list(labs))
            pred_labels += [[self.idx2tag[idx] for idx in labs]
                            for labs in bat_labels]

    result = []
    for one_lab, one_sen_str in zip(pred_labels, sentences):
        result.append(self.get_prediction(one_lab, one_sen_str))

    if isinstance(out_file, str):
        self._out_file(result, out_file)
    return result
def train(self, batch_generator, max_steps, save_path, save_every_n,
          log_every_n):
    self.session = tf.Session()
    with self.session as sess:
        sess.run(tf.global_variables_initializer())
        # Train network
        step = 0
        for x, y in batch_generator:
            inputs, sequence_lengths = pad_sequences(x, pad_mark=0)
            targets, _ = pad_sequences(y, pad_mark=0)
            step += 1
            start = time.time()
            feed = {
                self.inputs: inputs,
                self.targets: targets,
                self.sequence_lengths: sequence_lengths,
                self.keep_prob: self.train_keep_prob
            }
            batch_loss, _ = sess.run([self.loss, self.optimizer],
                                     feed_dict=feed)
            end = time.time()
            # control the logging frequency
            if step % log_every_n == 0:
                print('step: {}/{}... '.format(step, max_steps),
                      'loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format(end - start))
            if step % save_every_n == 0:
                self.saver.save(sess, os.path.join(save_path, 'model'),
                                global_step=step)
            if step >= max_steps:
                break
        self.saver.save(sess, os.path.join(save_path, 'model'),
                        global_step=step)
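# The trainer above only assumes `batch_generator` yields (x, y) pairs of
# un-padded id lists; padding happens per batch inside the loop. A
# hypothetical generator compatible with that contract (the `data` layout and
# shuffling are illustrative assumptions, not from the original repo):

import random

def batch_generator(data, batch_size, shuffle=True):
    """Yield (x, y) batches of variable-length id lists, looping forever;
    the trainer breaks out of the loop itself once max_steps is reached.
    `data` is assumed to be a list of (input_ids, target_ids) pairs."""
    while True:
        if shuffle:
            random.shuffle(data)
        for i in range(0, len(data) - batch_size + 1, batch_size):
            batch = data[i:i + batch_size]
            yield [p[0] for p in batch], [p[1] for p in batch]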
def get_feed_dict(self, batch, type='Train', dropout=None):
    # inputs are already token ids; batch size is derived dynamically
    if type == 'Train':
        input_ids, label_ids, sequence_lengths, input_mask = [], [], [], []
        if self.soft_masked:
            label_bio = []
        for data in batch:
            input_len = len(data['token_ids'])
            input_ids.append(data['token_ids'])
            input_mask.append([1] * input_len)
            label_ids.append(data['labels'])
            sequence_lengths.append(input_len)
            if self.soft_masked:
                label_bio.append([self.tags_dict[x] for x in data['bio']])
        max_length = min(max(sequence_lengths), self.bert_max_len)
        feed = {self.bert_api.input_ids: pad_sequences(input_ids, max_length),
                self.bert_api.input_mask: pad_sequences(input_mask, max_length),
                self.labels: pad_sequences(label_ids, max_length),
                self.bert_api.sequence_lengths: sequence_lengths,
                self.bert_api.dropout: dropout}
        if self.soft_masked:
            feed[self.label_bio] = pad_sequences(label_bio, max_length)
        return feed, sequence_lengths
    elif type == 'Eval':
        input_ids, label_bio, sequence_lengths, input_mask = [], [], [], []
        for data in batch:
            input_len = len(data['token_ids'])
            input_ids.append(data['token_ids'])
            input_mask.append([1] * input_len)
            label_bio.append(data['bio'][1:-1])  # drop the first and last tag
            sequence_lengths.append(input_len)
        max_length = min(max(sequence_lengths), self.bert_max_len)
        feed = {self.bert_api.input_ids: pad_sequences(input_ids, max_length),
                self.bert_api.input_mask: pad_sequences(input_mask, max_length),
                self.bert_api.sequence_lengths: sequence_lengths}
        return feed, input_ids, label_bio, sequence_lengths
    elif type == 'Pred':  # to be finished
        sequence_lengths = [len(x) for x in batch]
        max_length = min(max(sequence_lengths), self.bert_max_len)
        feed = {self.bert_api.input_ids: pad_sequences(batch, max_length),
                self.bert_api.sequence_lengths: sequence_lengths}
        return feed, sequence_lengths
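# Note the different contract here: pad_sequences(seqs, max_length) takes an
# explicit target length and returns only the padded batch, unlike the
# two-return, batch-max variant sketched earlier. A minimal sketch of the
# fixed-length helper this feed builder appears to assume (pad token 0 and
# truncation to max_length are assumptions):

def pad_sequences(sequences, max_length, pad_tok=0):
    """Pad or truncate every sequence to exactly max_length."""
    return [list(seq)[:max_length] + [pad_tok] * max(max_length - len(seq), 0)
            for seq in sequences]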
def get_feed_dict(self, words, lr=None, dropout=None, iob=None,
                  mention_type=None, mentions=None, mention_size=None):
    """
    Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of word ids; with
            char features enabled, each word is itself a list of char ids
        lr: (float) learning rate
        dropout: (float) keep prob
        iob: list of lists of IOB tag ids
        mention_type: list of lists of mention-type ids
        mentions: list of mentions, each a list of word ids
        mention_size: number of mentions per sentence

    Returns:
        dict {placeholder: value}
    """
    # perform padding of the given data
    if self.config.chars:
        char_ids, word_ids = zip(*words)
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
    else:
        word_ids, sequence_lengths = pad_sequences(words, 0)

    # build feed dictionary
    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths
    }

    if self.config.chars:
        feed[self.char_ids] = char_ids
        feed[self.word_lengths] = word_lengths

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    if iob is not None:
        feed[self.iob_type], _ = pad_sequences(iob, 0)

    if mention_type is not None:
        feed[self.mention_type], _ = pad_sequences(mention_type, 0)

    if mentions is not None:
        feed[self.mention], mention_length = pad_sequences(mentions,
                                                           pad_tok=0,
                                                           nlevels=2)
        feed[self.mention_length] = mention_length
        feed[self.mention_size] = mention_size

    return feed, sequence_lengths
def get_feed_dict(self, words, mor_tags=None, lex_tags=None, labels=None,
                  lr=None, dropout=None):
    """
    Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of word ids; with
            char features enabled, each word is itself a list of char ids
        mor_tags: list of lists of morphological tag ids
        lex_tags: list of lists of lexicon annotations; an entry is a single
            index or a comma-separated string of indices
        labels: list of lists of label ids
        lr: (float) learning rate
        dropout: (float) keep prob

    Returns:
        dict {placeholder: value}
    """
    # perform padding of the given data
    if self.config.chars:
        char_ids, word_ids = zip(*words)
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
    else:
        word_ids, sequence_lengths = pad_sequences(words, 0)

    # build feed dictionary
    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths
    }

    if self.config.chars:
        feed[self.char_ids] = char_ids
        feed[self.word_lengths] = word_lengths
        self.cnn_word_lengths = word_lengths

    if lex_tags is not None:
        lex_tags, _ = pad_sequences(lex_tags, 0)
        # convert each word's lexicon annotation into a multi-hot vector
        batch_arr = []
        for sentence in lex_tags:
            sentence_arr = []
            for each_word_lex in sentence:
                word_lex_hot = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
                if isinstance(each_word_lex, str) and ',' in each_word_lex:
                    for word in each_word_lex.split(','):
                        word_lex_hot[int(word)] = 1.0
                else:
                    word_lex_hot[int(each_word_lex)] = 1.0
                sentence_arr.append(word_lex_hot)
            batch_arr.append(sentence_arr)
        feed[self.lex_tags] = batch_arr

    if mor_tags is not None:
        mor_tags, _ = pad_sequences(mor_tags, 0)
        feed[self.mor_tags] = mor_tags

    if labels is not None:
        labels, _ = pad_sequences(labels, 0)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    return feed, sequence_lengths
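# For illustration, the lexicon-tag conversion above as a standalone helper:
# each annotation (a single index, or a comma-separated string of indices)
# becomes a 6-slot multi-hot vector. The sample values are hypothetical; the
# 6-slot size comes from the hard-coded list above.

def lex_to_multi_hot(each_word_lex, n_lex=6):
    vec = [0.0] * n_lex
    if isinstance(each_word_lex, str) and ',' in each_word_lex:
        for idx in each_word_lex.split(','):
            vec[int(idx)] = 1.0
    else:
        vec[int(each_word_lex)] = 1.0
    return vec

print(lex_to_multi_hot('2,4'))  # [0.0, 0.0, 1.0, 0.0, 1.0, 0.0]
print(lex_to_multi_hot(3))      # [0.0, 0.0, 0.0, 1.0, 0.0, 0.0]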
def get_feed_dict(self, words, pos=None, labels=None, lr=None, dropout=None):
    """
    Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of word ids; with
            char features enabled, each word is itself a list of char ids
        pos: list of lists of POS tag ids
        labels: list of lists of label ids
        lr: (float) learning rate
        dropout: (float) keep prob

    Returns:
        dict {placeholder: value}
    """
    # perform padding of the given data
    if self.config.chars:
        char_ids, word_ids = zip(*words)
        # debug: inspect multi-part vocab entries of the form "a/b/tag" by
        # joining everything before the final "/"
        for _wi in word_ids:
            for _ww in _wi:
                w = self.vocab_words[_ww].split("/")
                if len(w) > 2:
                    w = "".join(w[:-1])
                    print(w)
                # else: w = w[0]
                # print(self.processing_dics.get(self.vocab_words[_ww]))
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        pos_ids, _ = pad_sequences(pos, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
    else:
        word_ids, sequence_lengths = pad_sequences(words, 0)
        pos_ids, _ = pad_sequences(pos, 0)

    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths,
        self.pos_ids: pos_ids
    }

    if self.config.chars:
        feed[self.char_ids] = char_ids
        feed[self.word_lengths] = word_lengths

    if labels is not None:
        labels, _ = pad_sequences(labels, 0)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    return feed, sequence_lengths
def get_feed_dict(self, words, labels=None, pred_flags=None, lr=None,
                  dropout=None):
    feed, sequence_lengths = super().get_feed_dict(words, labels=labels,
                                                   lr=lr, dropout=dropout)
    if pred_flags is not None:
        _pred_flags, _ = pad_sequences(pred_flags, 0)
        feed[self.parent.pred_flag] = _pred_flags
    return feed, sequence_lengths
def get_feed_dict(self, words, poss, chunks, labels=None,
                  btup_idx_list=None, btup_words_list=None,
                  btup_depwords_list=None, btup_deprels_list=None,
                  btup_depwords_length_list=None, upbt_idx_list=None,
                  upbt_words_list=None, upbt_depwords_list=None,
                  upbt_deprels_list=None, upbt_depwords_length_list=None,
                  btup_formidx_list=None, upbt_formidx_list=None,
                  lr=None, dropout=None):
    """
    Given some data, pad it and build a feed dictionary
    """
    # perform padding of the given data
    if self.config.chars:
        char_ids, word_ids = zip(*words)
        word_ids, sequence_lengths = pad_sequences(word_ids, self.nwords,
                                                   self.max_sentence_size,
                                                   self.max_word_size)
        char_ids, word_lengths = pad_sequences(char_ids, self.nchars,
                                               self.max_sentence_size,
                                               self.max_word_size, nlevels=2)
    else:
        word_ids, sequence_lengths = pad_sequences(words, self.nwords,
                                                   self.max_sentence_size,
                                                   self.max_word_size)

    # build feed dictionary
    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths
    }

    if self.config.chars:
        feed[self.char_ids] = char_ids
        feed[self.word_lengths] = word_lengths

    if labels is not None:
        labels, _ = pad_sequences(labels, 2, self.max_sentence_size,
                                  self.max_word_size)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    # dependency-tree features
    if btup_idx_list is not None:
        feed[self.tbatch_size] = len(btup_idx_list)
        btup_idx_list, _ = pad_sequences(btup_idx_list, -1,
                                         self.max_sentence_size)
        feed[self.btup_word_orders] = btup_idx_list

    if btup_words_list is not None:
        btup_words_list, _ = pad_sequences(btup_words_list, self.nwords,
                                           self.max_sentence_size)
        feed[self.btup_word_ids] = btup_words_list

    if btup_depwords_list is not None:
        btup_depwords_list, _ = pad_sequences(btup_depwords_list, -1,
                                              self.max_sentence_size,
                                              self.max_btup_deps_len,
                                              nlevels=2)
        feed[self.btup_deps_ids] = btup_depwords_list

    if btup_deprels_list is not None:
        btup_deprels_list, _ = pad_sequences(btup_deprels_list, self.nrels,
                                             self.max_sentence_size,
                                             self.max_btup_deps_len,
                                             nlevels=2)
        feed[self.btup_rels_ids] = btup_deprels_list

    if btup_depwords_length_list is not None:
        btup_depwords_length_list, _ = pad_sequences(
            btup_depwords_length_list, 0, self.max_sentence_size)
        feed[self.btup_deps_lens] = btup_depwords_length_list

    if upbt_idx_list is not None:
        upbt_idx_list, _ = pad_sequences(upbt_idx_list, -1,
                                         self.max_sentence_size)
        feed[self.upbt_word_orders] = upbt_idx_list

    if upbt_words_list is not None:
        upbt_words_list, _ = pad_sequences(upbt_words_list, self.nwords,
                                           self.max_sentence_size)
        feed[self.upbt_word_ids] = upbt_words_list

    if upbt_depwords_list is not None:
        upbt_depwords_list, _ = pad_sequences(upbt_depwords_list, -1,
                                              self.max_sentence_size,
                                              self.max_upbt_deps_len,
                                              nlevels=2)
        feed[self.upbt_deps_ids] = upbt_depwords_list

    if upbt_deprels_list is not None:
        upbt_deprels_list, _ = pad_sequences(upbt_deprels_list, self.nrels,
                                             self.max_sentence_size,
                                             self.max_upbt_deps_len,
                                             nlevels=2)
        feed[self.upbt_rels_ids] = upbt_deprels_list

    if upbt_depwords_length_list is not None:
        upbt_depwords_length_list, _ = pad_sequences(
            upbt_depwords_length_list, 0, self.max_sentence_size)
        feed[self.upbt_deps_lens] = upbt_depwords_length_list

    if btup_formidx_list is not None:
        btup_formidx_list, _ = pad_sequences(btup_formidx_list, -1,
                                             self.max_sentence_size)
        feed[self.btup_formidxs] = btup_formidx_list

    if upbt_formidx_list is not None:
        upbt_formidx_list, _ = pad_sequences(upbt_formidx_list, -1,
                                             self.max_sentence_size)
        feed[self.upbt_formidxs] = upbt_formidx_list

    return feed, sequence_lengths
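# This repo's pad_sequences clearly has yet another signature: a pad token
# plus fixed max_sentence_size / max_word_size targets. A plausible sketch
# consistent with the call sites above (the exact truncation rules are
# assumptions):

def pad_sequences(sequences, pad_tok, max_sentence_size, max_word_size=None,
                  nlevels=1):
    """Pad/truncate every sentence to max_sentence_size; with nlevels=2,
    also pad every inner list (chars, dependencies) to max_word_size."""
    if nlevels == 1:
        padded, lengths = [], []
        for seq in sequences:
            seq = list(seq)[:max_sentence_size]
            lengths.append(len(seq))
            padded.append(seq + [pad_tok] * (max_sentence_size - len(seq)))
        return padded, lengths
    padded, lengths = [], []
    for sent in sequences:
        sent_pad, word_lens = [], []
        for word in list(sent)[:max_sentence_size]:
            word = list(word)[:max_word_size]
            word_lens.append(len(word))
            sent_pad.append(word + [pad_tok] * (max_word_size - len(word)))
        while len(sent_pad) < max_sentence_size:
            sent_pad.append([pad_tok] * max_word_size)
            word_lens.append(0)
        padded.append(sent_pad)
        lengths.append(word_lens)
    return padded, lengths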
                               args.idx2tag)
    f1 = evaluate(test_labels, pred_labels)
    print(f1)
    print(test_sentences[10:15])
    print('test:', test_labels[10:15])
    print('pred:', pred_labels[10:15])
elif out_args.mode == 'demo':
    print('————————————demo————————————')
    model = biLstm_crf_model(args)
    model.load_weights(args.model_path + args.model_name + '.h5')
    while True:
        print('please input a sentence:')
        one_sentence_str = input()
        if one_sentence_str != '':
            one_sentence_list = [[char for char in one_sentence_str]]
            one_sentence_list, seq_len = pad_sequences(one_sentence_list,
                                                       args.maxLen)
            one_sentence_idx = sequences2idx(one_sentence_list,
                                             args.char2idx)  # note: a 2-D list
            predictions = model.predict(np.array(one_sentence_idx))
            # possible issue: where is the CRF transition matrix applied?
            one_label = get_pred_labels(predictions, seq_len,
                                        args.idx2tag)[0]
            print(one_label)
            prediction_list = args.get_prediction(one_label, one_sentence_str)
            print(prediction_list, '\n')
# Hyperparameters
dim_word = 300
dim_char = 100
hidden_size_char = 100   # lstm on chars
hidden_size_lstm = 300   # lstm on word embeddings
nepochs = args.epochs
lr = 0.0105
lr_decay = 0.0005
batch_size = 10
dropout = 0.5

# Process training dataset
print('Creating training dataset...')
# NOTE: a batch size of len(train) returns the entire dataset as one batch!
words, labels = list(minibatches(train, len(train)))[0]
char_ids, word_ids = zip(*words)
word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=pad_tag)
char_ids, word_lengths = pad_sequences(char_ids, pad_tok=pad_tag, nlevels=2)
labels, _ = pad_sequences(labels, pad_tok=pad_tag)

# Convert word and char ids to np arrays; one-hot encode labels
char_ids_arr = np.array(char_ids)
word_ids_arr = np.array(word_ids)
labels_arr = np.array(labels)
labels_arr_one_hot = np.eye(n_labels)[labels_arr]

# Process validation dataset
print('Creating validation dataset...')
words_valid, labels_valid = list(minibatches(valid, len(valid)))[0]
char_ids_valid, word_ids_valid = zip(*words_valid)
word_ids_valid, sequence_lengths_valid = pad_sequences(word_ids_valid,
                                                       pad_tok=pad_tag)
char_ids_valid, word_lengths_valid = pad_sequences(char_ids_valid,
                                                   pad_tok=pad_tag,
                                                   nlevels=2)
def get_feed_dict(self, words, labels=None, lr=None, dropout=None): """ Given some data, pad it and build a feed dictionary Args: words: list of sentences. A sentence is a list of ids of a list of words. A word is a list of ids labels: list of ids lr: (float) learning rate dropout: (float) keep prob Returns: dict {placeholder: value} """ # perform padding of the given data if self.config.chars: #print (words[0]) char_ids, pref_ids, suff_ids, pref_ids_2, suff_ids_2, pref_ids_4, suff_ids_4, word_ids = zip( *words) ##, pref_ids, suff_ids, word_ids, sequence_lengths = pad_sequences(word_ids, 0) ##################################################################### pref_ids, sequence_lengths_pref = pad_sequences(pref_ids, 0) suff_ids, sequence_lengths_suff = pad_sequences(suff_ids, 0) pref_ids_2, sequence_lengths_pref_2 = pad_sequences(pref_ids_2, 0) suff_ids_2, sequence_lengths_suff_2 = pad_sequences(suff_ids_2, 0) pref_ids_4, sequence_lengths_pref_4 = pad_sequences(pref_ids_4, 0) suff_ids_4, sequence_lengths_suff_4 = pad_sequences(suff_ids_4, 0) ##################################################################### char_ids, word_lengths = pad_sequences( char_ids, pad_tok=0, nlevels=2 ) ##################################################################### Orig else: word_ids, sequence_lengths = pad_sequences(words, 0) # build feed dictionary feed = { ########################### Same for suffix prefix self.word_ids: word_ids, self.sequence_lengths: sequence_lengths, ###### The sequence length will be same for all these features: words, suffix, prefix, suffix_2 and suffix_3. self.pref_ids: pref_ids, self.sequence_lengths: sequence_lengths_pref, self.suff_ids: suff_ids, self.sequence_lengths: sequence_lengths_suff, self.pref_ids_2: pref_ids_2, self.sequence_lengths: sequence_lengths_pref_2, self.suff_ids_2: suff_ids_2, self.sequence_lengths: sequence_lengths_suff_2, self.pref_ids_4: pref_ids_4, self.sequence_lengths: sequence_lengths_pref_4, self.suff_ids_4: suff_ids_4, self.sequence_lengths: sequence_lengths_suff_4 } if self.config.chars: feed[self.char_ids] = char_ids feed[self.word_lengths] = word_lengths ### if labels is not None: labels, _ = pad_sequences(labels, 0) feed[self.labels] = labels if lr is not None: feed[self.lr] = lr if dropout is not None: feed[self.dropout] = dropout return feed, sequence_lengths
def test_seq_padding():
    a = [[1, 2, 3, 5], [2, 3, 2], [3, 1, 4, 1, 5, 9]]
    seq, length = pad_sequences(a, 0)
    # assuming pad_sequences pads with 0 up to the longest sequence in the
    # batch and returns the original lengths
    assert seq == [[1, 2, 3, 5, 0, 0], [2, 3, 2, 0, 0, 0], [3, 1, 4, 1, 5, 9]]
    assert length == [4, 3, 6]
def test():
    ## arguments
    train_path = sys.argv[1]
    test_path = sys.argv[2]
    predict_path = sys.argv[3]
    model_name = sys.argv[4]
    char_embed_path = sys.argv[5]
    word_embed_path = sys.argv[6]
    pos_embed_path = sys.argv[7]
    dict_path = sys.argv[8]

    train_rate = 0.9
    max_char_ctx_len = 1160
    max_word_ctx_len = 680
    char_ctx_len = 1160
    char_qus_len = 240
    word_ctx_len = 400
    word_qus_len = 40
    word_char_len = 5
    char_embed_size = 128
    word_embed_size = 128
    pos_embed_size = 32
    hidden_size = 64
    model_size = 64
    max_epochs = 50
    batch_size = 8
    lr = 0.001
    drop_rate = 0.5
    recur_drop_rate = 0.0
    patience = 20

    ## load data
    print("load data")
    st = time.time()
    train_raw_data = data_utils.load_json_data(train_path)
    test_raw_data = data_utils.load_json_data(test_path)
    # # load pos data
    # train_gen_pos_data = data_utils.load_json_data(train_pos_path)
    # test_gen_pos_data = data_utils.load_json_data(test_pos_path)
    # load embeddings
    char_embedding = word2vec.Word2Vec.load(char_embed_path)
    word_embedding = word2vec.Word2Vec.load(word_embed_path)
    pos_embedding = word2vec.Word2Vec.load(pos_embed_path)
    et = time.time()
    print("cost time:", et - st)

    ## process data
    print("process data")
    st = time.time()
    # data format: (id, context, question, answer_start, answer_end)
    train_data = data_utils.make_train_data(train_raw_data)
    # data format: (id, context, question)
    test_data = data_utils.make_test_data(test_raw_data)
    train_context = [data[1] for data in train_data]
    train_question = [data[2] for data in train_data]
    train_char_answer_start = [data[3] for data in train_data]
    train_char_answer_end = [data[4] for data in train_data]
    # train_context_poss = [data['context'] for data in train_gen_pos_data['data']]
    # train_question_poss = [data['question'] for data in train_gen_pos_data['data']]
    test_id = [data[0] for data in test_data]
    test_context = [data[1] for data in test_data]
    test_question = [data[2] for data in test_data]
    # test_context_poss = [data['context'] for data in test_gen_pos_data['data']]
    # test_question_poss = [data['question'] for data in test_gen_pos_data['data']]
    del train_data
    del test_data
    et = time.time()
    print("cost time:", et - st)

    ## load vocabulary
    print("load vocabulary")
    st = time.time()
    char_vocab = data_utils.load_json_data('model_%s_char_vocab.json' % model_name)
    word_vocab = data_utils.load_json_data('model_%s_word_vocab.json' % model_name)
    pos_vocab = data_utils.load_json_data('model_%s_pos_vocab.json' % model_name)
    # poss = train_context_poss + train_question_poss + test_context_poss + test_question_poss
    # pos_vocab, rev_pos_vocab = data_utils.build_vocabulary_with_embedding(poss, pos_embedding)
    char_vocab_size = len(char_vocab)
    word_vocab_size = len(word_vocab)
    pos_vocab_size = len(pos_vocab)
    et = time.time()
    print("char vocab size:", char_vocab_size)
    print("word vocab size:", word_vocab_size)
    print("pos vocab size:", pos_vocab_size)
    print("cost time:", et - st)

    ## tokenize data
    print("tokenize data")
    st = time.time()
    train_context_chars = data_utils.tokenize_to_chars(train_context)
    train_question_chars = data_utils.tokenize_to_chars(train_question)
    test_context_chars = data_utils.tokenize_to_chars(test_context)
    test_question_chars = data_utils.tokenize_to_chars(test_question)
    train_context_words = data_utils.tokenize_to_words(train_context,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_question_words = data_utils.tokenize_to_words(train_question,
                                                        init_dict=True,
                                                        dict_path=dict_path)
    test_context_words = data_utils.tokenize_to_words(test_context,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_question_words = data_utils.tokenize_to_words(test_question,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_context_poss = data_utils.tokenize_to_poss(train_context,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    train_question_poss = data_utils.tokenize_to_poss(train_question,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_context_poss = data_utils.tokenize_to_poss(test_context,
                                                    init_dict=True,
                                                    dict_path=dict_path)
    test_question_poss = data_utils.tokenize_to_poss(test_question,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    et = time.time()
    print("cost time:", et - st)

    ## select data
    # keep only the examples whose sequence lengths satisfy the length
    # constraints
    print("select data")
    st = time.time()
    select_indices = data_utils.select_data_by_lengths(train_context_words,
                                                       train_question_words,
                                                       word_ctx_len,
                                                       word_qus_len)
    train_context_chars = [train_context_chars[i] for i in select_indices]
    train_context_words = [train_context_words[i] for i in select_indices]
    train_context_poss = [train_context_poss[i] for i in select_indices]
    train_question_chars = [train_question_chars[i] for i in select_indices]
    train_question_words = [train_question_words[i] for i in select_indices]
    train_question_poss = [train_question_poss[i] for i in select_indices]
    train_char_answer_start = [train_char_answer_start[i] for i in select_indices]
    train_char_answer_end = [train_char_answer_end[i] for i in select_indices]
    et = time.time()
    print("cost time:", et - st)

    ## set answer
    # must be done after tokenizing sentences into words
    print("set answer")
    st = time.time()
    train_word_answer_start, train_word_answer_end = data_utils.set_word_answer(
        train_context_words, train_char_answer_start,
        train_char_answer_end, word_ctx_len)
    train_answer_start, train_answer_end = train_word_answer_start, train_word_answer_end
    et = time.time()
    print("cost time:", et - st)

    ## pad data
    print("pad data")
    st = time.time()
    # clip words to chars
    # must be done after building the vocab (which adds PAD)
    train_context_clip_chars = data_utils.clip_words_to_chars(
        train_context_words, word_char_len)
    train_question_clip_chars = data_utils.clip_words_to_chars(
        train_question_words, word_char_len)
    test_context_clip_chars = data_utils.clip_words_to_chars(
        test_context_words, word_char_len)
    test_question_clip_chars = data_utils.clip_words_to_chars(
        test_question_words, word_char_len)
    # padding
    train_context_pad_chars = data_utils.pad_sequences(
        train_context_clip_chars, word_ctx_len * word_char_len)
    train_question_pad_chars = data_utils.pad_sequences(
        train_question_clip_chars, word_qus_len * word_char_len)
    train_context_pad_words = data_utils.pad_sequences(train_context_words,
                                                       word_ctx_len)
    train_question_pad_words = data_utils.pad_sequences(train_question_words,
                                                        word_qus_len)
    train_context_pad_poss = data_utils.pad_sequences(train_context_poss,
                                                      word_ctx_len)
    train_question_pad_poss = data_utils.pad_sequences(train_question_poss,
                                                       word_qus_len)
    test_context_pad_chars = data_utils.pad_sequences(
        test_context_clip_chars, word_ctx_len * word_char_len)
    test_question_pad_chars = data_utils.pad_sequences(
        test_question_clip_chars, word_qus_len * word_char_len)
    test_context_pad_words = data_utils.pad_sequences(test_context_words,
                                                      word_ctx_len)
    test_question_pad_words = data_utils.pad_sequences(test_question_words,
                                                       word_qus_len)
    test_context_pad_poss = data_utils.pad_sequences(test_context_poss,
                                                     word_ctx_len)
    test_question_pad_poss = data_utils.pad_sequences(test_question_poss,
                                                      word_qus_len)
    et = time.time()
    print("cost time:", et - st)

    ## make arrays
    print("make arrays")
    st = time.time()
    # map vocab to index
    train_context_char_indices = data_utils.map_vocabulary_index(
        train_context_pad_chars, char_vocab)
    train_question_char_indices = data_utils.map_vocabulary_index(
        train_question_pad_chars, char_vocab)
    train_context_word_indices = data_utils.map_vocabulary_index(
        train_context_pad_words, word_vocab)
    train_question_word_indices = data_utils.map_vocabulary_index(
        train_question_pad_words, word_vocab)
    train_context_pos_indices = data_utils.map_vocabulary_index(
        train_context_pad_poss, pos_vocab)
    train_question_pos_indices = data_utils.map_vocabulary_index(
        train_question_pad_poss, pos_vocab)
    test_context_char_indices = data_utils.map_vocabulary_index(
        test_context_pad_chars, char_vocab)
    test_question_char_indices = data_utils.map_vocabulary_index(
        test_question_pad_chars, char_vocab)
    test_context_word_indices = data_utils.map_vocabulary_index(
        test_context_pad_words, word_vocab)
    test_question_word_indices = data_utils.map_vocabulary_index(
        test_question_pad_words, word_vocab)
    test_context_pos_indices = data_utils.map_vocabulary_index(
        test_context_pad_poss, pos_vocab)
    test_question_pos_indices = data_utils.map_vocabulary_index(
        test_question_pad_poss, pos_vocab)

    # make one-hot labels
    train_answer_start_onehot = data_utils.one_hot_encoding(
        train_answer_start, word_ctx_len)
    train_answer_end_onehot = data_utils.one_hot_encoding(
        train_answer_end, word_ctx_len)

    # to arrays
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start; Y2: answer_end
    train_X1 = np.array(train_context_char_indices, dtype=np.int32)
    train_X2 = np.array(train_context_word_indices, dtype=np.int32)
    train_X3 = np.array(train_context_pos_indices, dtype=np.int32)
    train_X4 = np.array(train_question_char_indices, dtype=np.int32)
    train_X5 = np.array(train_question_word_indices, dtype=np.int32)
    train_X6 = np.array(train_question_pos_indices, dtype=np.int32)
    train_Y1 = np.array(train_answer_start_onehot, dtype=np.int32)
    train_Y2 = np.array(train_answer_end_onehot, dtype=np.int32)
    train_word_ans1 = np.array(train_answer_start, dtype=np.int32)
    train_word_ans2 = np.array(train_answer_end, dtype=np.int32)
    train_ans1 = np.array(train_char_answer_start, dtype=np.int32)
    train_ans2 = np.array(train_char_answer_end, dtype=np.int32)
    test_X1 = np.array(test_context_char_indices, dtype=np.int32)
    test_X2 = np.array(test_context_word_indices, dtype=np.int32)
    test_X3 = np.array(test_context_pos_indices, dtype=np.int32)
    test_X4 = np.array(test_question_char_indices, dtype=np.int32)
    test_X5 = np.array(test_question_word_indices, dtype=np.int32)
    test_X6 = np.array(test_question_pos_indices, dtype=np.int32)

    # make embedding weight matrices
    word_embed_matrix = data_utils.make_embedding_matrix(
        word_embedding, word_vocab, word_embed_size)
    char_embed_matrix = data_utils.make_embedding_matrix(
        char_embedding, char_vocab, char_embed_size)
    pos_embed_matrix = data_utils.make_embedding_matrix(
        pos_embedding, pos_vocab, pos_embed_size)

    # delete data to release memory
    del train_context, train_question, test_context, test_question
    del train_context_chars, train_question_chars, test_context_chars, test_question_chars
    # del train_context_words, train_question_words, test_context_words, test_question_words
    del train_context_clip_chars, train_question_clip_chars, test_context_clip_chars, test_question_clip_chars
    del train_context_char_indices, train_question_char_indices, test_context_char_indices, test_question_char_indices
    del train_context_word_indices, train_question_word_indices, test_context_word_indices, test_question_word_indices
    del train_context_pos_indices, train_question_pos_indices, test_context_pos_indices, test_question_pos_indices
    del train_word_answer_start, train_word_answer_end, train_char_answer_start, train_char_answer_end
    del train_answer_start_onehot, train_answer_end_onehot
    et = time.time()
    print("train shape:", train_X1.shape, train_X2.shape, train_X3.shape,
          train_X4.shape, train_X5.shape, train_X6.shape,
          train_Y1.shape, train_Y2.shape)
    print("test shape:", test_X1.shape, test_X2.shape, test_X3.shape,
          test_X4.shape, test_X5.shape, test_X6.shape)
    print("cost time:", et - st)

    ## XXX build model
    print("build model")
    st = time.time()
    # input layers
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start; Y2: answer_end
    var_x1_input = Input(shape=(word_ctx_len * word_char_len,), dtype=np.int32)
    var_x2_input = Input(shape=(word_ctx_len,), dtype=np.int32)
    var_x3_input = Input(shape=(word_ctx_len,), dtype=np.int32)
    var_x4_input = Input(shape=(word_qus_len * word_char_len,), dtype=np.int32)
    var_x5_input = Input(shape=(word_qus_len,), dtype=np.int32)
    var_x6_input = Input(shape=(word_qus_len,), dtype=np.int32)

    # embedding layers
    # shape: (None, ctx_length * word_length, char_embed_size)
    var_x1_embed = Embedding(input_dim=char_vocab_size,
                             output_dim=char_embed_size,
                             weights=[char_embed_matrix],
                             input_length=word_ctx_len * word_char_len,
                             trainable=False)(var_x1_input)
    # shape: (None, ctx_length, word_embed_size)
    var_x2_embed = Embedding(input_dim=word_vocab_size,
                             output_dim=word_embed_size,
                             weights=[word_embed_matrix],
                             input_length=word_ctx_len,
                             trainable=False)(var_x2_input)
    # shape: (None, ctx_length, pos_embed_size)
    var_x3_embed = Embedding(input_dim=pos_vocab_size,
                             output_dim=pos_embed_size,
                             weights=[pos_embed_matrix],
                             input_length=word_ctx_len,
                             trainable=False)(var_x3_input)
    # shape: (None, qus_length * word_length, char_embed_size)
    var_x4_embed = Embedding(input_dim=char_vocab_size,
                             output_dim=char_embed_size,
                             weights=[char_embed_matrix],
                             input_length=word_qus_len * word_char_len,
                             trainable=False)(var_x4_input)
    # shape: (None, qus_length, word_embed_size)
    var_x5_embed = Embedding(input_dim=word_vocab_size,
                             output_dim=word_embed_size,
                             weights=[word_embed_matrix],
                             input_length=word_qus_len,
                             trainable=False)(var_x5_input)
    # shape: (None, qus_length, pos_embed_size)
    var_x6_embed = Embedding(input_dim=pos_vocab_size,
                             output_dim=pos_embed_size,
                             weights=[pos_embed_matrix],
                             input_length=word_qus_len,
                             trainable=False)(var_x6_input)

    # shape: (None, ctx_length, word_length * char_embed_size)
    var_x1_embed = Reshape([word_ctx_len,
                            word_char_len * char_embed_size])(var_x1_embed)
    # shape: (None, qus_length, word_length * char_embed_size)
    var_x4_embed = Reshape([word_qus_len,
                            word_char_len * char_embed_size])(var_x4_embed)

    var_char_embed_layer = Dense(units=word_embed_size)
    # shape: (None, ctx_length, word_embed_size)
    var_x1_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_ctx_len, word_char_len * char_embed_size))(var_x1_embed)
    var_x1_embed = Activation('relu')(var_x1_embed)
    # var_x1_embed = Dropout(rate=drop_rate)(var_x1_embed)
    # shape: (None, qus_length, word_embed_size)
    var_x4_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_qus_len, word_char_len * char_embed_size))(var_x4_embed)
    var_x4_embed = Activation('relu')(var_x4_embed)
    # var_x4_embed = Dropout(rate=drop_rate)(var_x4_embed)

    # XXX concatenate word embedding and pos embedding directly
    # shape: (None, ctx_length, word_embed_size * 2 + pos_embed_size)
    var_ctx_embed = concatenate([var_x1_embed, var_x2_embed, var_x3_embed],
                                axis=2)
    # shape: (None, qus_length, word_embed_size * 2 + pos_embed_size)
    var_qus_embed = concatenate([var_x4_embed, var_x5_embed, var_x6_embed],
                                axis=2)
    var_ctx_embed = Dropout(rate=drop_rate)(var_ctx_embed)
    var_qus_embed = Dropout(rate=drop_rate)(var_qus_embed)

    # shape: (None, ctx_length, hidden_size * 2)
    var_ctx_lstm = Bidirectional(
        LSTM(units=hidden_size, recurrent_dropout=recur_drop_rate,
             return_sequences=True))(var_ctx_embed)
    # shape: (None, qus_length, hidden_size * 2)
    var_qus_lstm = Bidirectional(
        LSTM(units=hidden_size, recurrent_dropout=recur_drop_rate,
             return_sequences=True))(var_qus_embed)
    # dropout ?
    # var_ctx_lstm = Dropout(rate=drop_rate)(var_ctx_lstm)
    # var_qus_lstm = Dropout(rate=drop_rate)(var_qus_lstm)

    # attention layers
    # shape: (None, ctx_length * hidden_size * 2)
    var_ctx_flatten = Flatten()(var_ctx_lstm)
    # shape: (None, qus_length * hidden_size * 2)
    var_qus_flatten = Flatten()(var_qus_lstm)
    # shape: (None, qus_length, ctx_length * hidden_size * 2)
    var_ctx_repeat = RepeatVector(word_qus_len)(var_ctx_flatten)
    # shape: (None, ctx_length, qus_length * hidden_size * 2)
    var_qus_repeat = RepeatVector(word_ctx_len)(var_qus_flatten)
    # shape: (None, qus_length, ctx_length, hidden_size * 2)
    var_ctx_repeat = Reshape([word_qus_len, word_ctx_len,
                              hidden_size * 2])(var_ctx_repeat)
    # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_qus_repeat = Reshape([word_ctx_len, word_qus_len,
                              hidden_size * 2])(var_qus_repeat)
    # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_ctx_repeat = Permute([2, 1, 3])(var_ctx_repeat)
    # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_mul_repeat = multiply([var_ctx_repeat, var_qus_repeat])
    # shape: (None, ctx_length, qus_length, hidden_size * 6)
    var_sim_repeat = concatenate([var_ctx_repeat, var_qus_repeat,
                                  var_mul_repeat], axis=3)
    # shape: (None, ctx_length * qus_length, hidden_size * 6)
    var_sim_sequence = Reshape([word_ctx_len * word_qus_len,
                                hidden_size * 6])(var_sim_repeat)
    # dropout ?
    # var_sim_sequence = Dropout(rate=drop_rate)(var_sim_sequence)
    # shape: (None, ctx_length * qus_length, 1)
    var_similarity = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len * word_qus_len, hidden_size * 6))(var_sim_sequence)
    # shape: (None, ctx_length, qus_length)
    var_similarity = Reshape([word_ctx_len, word_qus_len])(var_similarity)
    var_similarity = Activation('relu')(var_similarity)
    # dropout ?
    # var_similarity = Dropout(rate=drop_rate)(var_similarity)

    # shape: (None, ctx_length, qus_length)
    var_c2qatt_weight = TimeDistributed(
        Activation('softmax'),
        input_shape=(word_ctx_len, word_qus_len))(var_similarity)
    # shape: (None, ctx_length, hidden_size * 2)
    var_c2qatt_ctx = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 1]))(
        [var_c2qatt_weight, var_qus_lstm])

    # shape: (None, ctx_length)
    var_q2catt_weight = Lambda(lambda x: K.max(x, axis=2))(var_similarity)
    # shape: (None, hidden_size * 2, ctx_length)
    var_q2catt_weight = RepeatVector(hidden_size * 2)(var_q2catt_weight)
    # shape: (None, ctx_length, hidden_size * 2)
    var_q2catt_weight = Permute([2, 1])(var_q2catt_weight)
    # shape: (None, ctx_length, hidden_size * 2)
    var_q2catt_ctx = multiply([var_q2catt_weight, var_ctx_lstm])

    # shape: (None, ctx_length, hidden_size * 2)
    var_c2qctx_attmul = multiply([var_ctx_lstm, var_c2qatt_ctx])
    # shape: (None, ctx_length, hidden_size * 2)
    var_q2cctx_attmul = multiply([var_ctx_lstm, var_q2catt_ctx])
    # shape: (None, ctx_length, hidden_size * 8)
    var_attention = concatenate(
        [var_ctx_lstm, var_c2qatt_ctx, var_c2qctx_attmul, var_q2cctx_attmul],
        axis=2)
    var_attention = Activation('relu')(var_attention)
    # dropout ?
    # var_attention = Dropout(rate=drop_rate)(var_attention)

    # model layers
    # shape: (None, ctx_length, model_size * 2)
    var_model1_lstm = Bidirectional(
        LSTM(units=model_size, recurrent_dropout=recur_drop_rate,
             return_sequences=True))(var_attention)
    # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    var_model1_att = concatenate([var_attention, var_model1_lstm], axis=2)
    # dropout ?
    # var_model1_att = Dropout(rate=drop_rate)(var_model1_att)
    # shape: (None, ctx_length, model_size * 2)
    var_model2_lstm = Bidirectional(
        LSTM(units=model_size, recurrent_dropout=recur_drop_rate,
             return_sequences=True))(var_model1_lstm)
    # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    var_model2_att = concatenate([var_attention, var_model2_lstm], axis=2)
    # dropout ?
    # var_model2_att = Dropout(rate=drop_rate)(var_model2_att)

    # output layers
    # shape: (None, ctx_length, 1)
    var_pointer1_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(var_model1_att)
    # shape: (None, ctx_length)
    var_pointer1_weight = Flatten()(var_pointer1_weight)
    # shape: (None, ctx_length)
    var_pointer1 = Activation('softmax')(var_pointer1_weight)
    # shape: (None, ctx_length, 1)
    var_pointer2_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(var_model2_att)
    # shape: (None, ctx_length)
    var_pointer2_weight = Flatten()(var_pointer2_weight)
    # shape: (None, ctx_length)
    var_pointer2 = Activation('softmax')(var_pointer2_weight)

    model = Model(inputs=[var_x1_input, var_x2_input, var_x3_input,
                          var_x4_input, var_x5_input, var_x6_input],
                  outputs=[var_pointer1, var_pointer2])
    adam = Adam(lr=lr)

    # Set loss functions ?  (unfinished custom-loss sketch)
    # def two_pointers_crossentropy(y_true, y_pred):
    #     p1_true, p1_pred = y_true[0], y_pred[0]
    #     p2_true, p2_pred = y_true[:, 1], y_pred[1]
    #     p1_loss = categorical_crops

    # XXX use multiple losses
    model.compile(
        optimizer=adam,
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
        loss_weights=[0.5, 0.5],
        metrics=['accuracy'])
    et = time.time()
    print("cost time:", et - st)

    ## evaluate
    print("evaluate")
    st = time.time()
    model = load_model('model_%s.h5' % model_name, custom_objects={'tf': tf})

    # compute predictions
    print("predict")
    st = time.time()
    train_Y1_hat, train_Y2_hat = model.predict(
        [train_X1, train_X2, train_X3, train_X4, train_X5, train_X6],
        batch_size=batch_size)
    et = time.time()
    print("cost time:", et - st)

    train_Y1_word_pred, train_Y2_word_pred = model_utils.constraint_predict(
        train_Y1_hat, train_Y2_hat)
    train_Y1_pred, train_Y2_pred = data_utils.set_char_answer(
        train_context_words, train_Y1_word_pred, train_Y2_word_pred)
    train_Y1_pred = np.array(train_Y1_pred, dtype=np.int32)
    train_Y2_pred = np.array(train_Y2_pred, dtype=np.int32)

    # evaluate predictions against the aligned answers (word level)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred, word_ctx_len)
    print("word-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("word-level train prec rec:", train_prec, train_rec)
    print("word-level train f1:", train_f1)

    # evaluate predictions against the real answers (char level)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred,
        max_char_ctx_len)
    print("char-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("char-level train prec rec:", train_prec, train_rec)
    print("char-level train f1:", train_f1)
    et = time.time()
    print("cost time:", et - st)

    ## test
    print("test")
    st = time.time()
    test_Y1_hat, test_Y2_hat = model.predict(
        [test_X1, test_X2, test_X3, test_X4, test_X5, test_X6],
        batch_size=batch_size)
    # compute predictions
    test_Y1_word_pred, test_Y2_word_pred = model_utils.constraint_predict(
        test_Y1_hat, test_Y2_hat)
    test_Y1_pred, test_Y2_pred = data_utils.set_char_answer(
        test_context_words, test_Y1_word_pred, test_Y2_word_pred)
    test_Y1_pred = np.array(test_Y1_pred, dtype=np.int32)
    test_Y2_pred = np.array(test_Y2_pred, dtype=np.int32)
    data_utils.write_predict(predict_path, test_id, test_Y1_pred, test_Y2_pred)
    et = time.time()
    print("cost time:", et - st)
def get_feed_dict(self, words, fw_words, bw_words, dict_labels, labels=None,
                  lr=None, dropout=None, test_flag=0):
    """
    Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of word ids; with
            char features enabled, each word is itself a list of char ids
        fw_words / bw_words: forward / backward language-model inputs,
            structured like `words`
        dict_labels: per-sentence dictionary-feature labels
        labels: list of lists of label ids
        lr: (float) learning rate
        dropout: (float) keep prob

    Returns:
        dict {placeholder: value}
    """
    # perform padding of the given data
    if (self.config.chars and not self.config.posTag
            and not self.config.dic_flag and not self.config.morphs):
        char_ids, word_ids = zip(*words)
        _, fw_lm_ids = zip(*fw_words)
        _, bw_lm_ids = zip(*bw_words)
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
        bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
    elif self.config.chars and self.config.posTag:
        # -------- posTag padding --------
        if self.config.dic_flag and not self.config.morphs:
            posTag_ids, char_ids, word_ids, dic_ids = zip(*words)
            fw_postag_ids, _, fw_lm_ids, _ = zip(*fw_words)
            bw_postag_ids, _, bw_lm_ids, _ = zip(*bw_words)
            dic_ids = []
            for d_labels in dict_labels:
                # all five channels must be initialized before appending
                tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4, tmp_dic5 = [], [], [], [], []
                for d_i in range(len(d_labels['labels1'])):
                    tmp_dic1.append(d_labels['labels1'][d_i])
                    tmp_dic2.append(d_labels['labels2'][d_i])
                    tmp_dic3.append(d_labels['labels3'][d_i])
                    tmp_dic4.append(d_labels['labels4'][d_i])
                    tmp_dic5.append(d_labels['labels5'][d_i])
                dic_ids.append([tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4,
                                tmp_dic5])
        elif self.config.dic_flag and self.config.morphs:
            (posTag_ids, char_ids, word_ids, dic_ids,
             morph_ids, syl_ids) = zip(*words)
            fw_postag_ids, _, fw_lm_ids, _, _, _ = zip(*fw_words)
            bw_postag_ids, _, bw_lm_ids, _, _, _ = zip(*bw_words)
            dic_ids = []
            for d_labels in dict_labels:
                tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4, tmp_dic5 = [], [], [], [], []
                for d_i in range(len(d_labels['labels1'])):
                    tmp_dic1.append(d_labels['labels1'][d_i])
                    tmp_dic2.append(d_labels['labels2'][d_i])
                    tmp_dic3.append(d_labels['labels3'][d_i])
                    tmp_dic4.append(d_labels['labels4'][d_i])
                    tmp_dic5.append(d_labels['labels5'][d_i])
                dic_ids.append([tmp_dic1, tmp_dic2, tmp_dic3, tmp_dic4,
                                tmp_dic5])
        else:
            posTag_ids, char_ids, word_ids = zip(*words)
            fw_postag_ids, _, _, fw_lm_ids = zip(*fw_words)
            bw_postag_ids, _, _, bw_lm_ids = zip(*bw_words)

        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
        bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)
        # if self.config.dic_flag:
        #     dic_ids, sequence_lengths = pad_sequences(dic_ids, 0)
        if self.config.dic_flag:
            dic_ids, sequence_lengths = pad_sequences(dic_ids, pad_tok=0,
                                                      nlevels=4)
            # turn the five dictionary label channels into multi-hot vectors
            dic_embeddings = np.zeros((len(word_ids), len(word_ids[0]), 6),
                                      dtype=np.float32)
            # elif last_flag == True:
            #     dic_embeddings = np.zeros((3, len(word_ids[0]), 7),
            #                               dtype=np.float32)
            for batch_i, batch_dict in enumerate(dic_ids):
                for word_i in range(len(batch_dict[0])):
                    dic_embeddings[batch_i][word_i][int(batch_dict[0][word_i])] = 1
                    dic_embeddings[batch_i][word_i][int(batch_dict[1][word_i])] = 1
                    dic_embeddings[batch_i][word_i][int(batch_dict[2][word_i])] = 1
                    dic_embeddings[batch_i][word_i][int(batch_dict[3][word_i])] = 1
                    dic_embeddings[batch_i][word_i][int(batch_dict[4][word_i])] = 1
            dic_ids = dic_embeddings

        if self.config.morphs:
            morph_ids, morph_lengths = pad_sequences(morph_ids, pad_tok=0,
                                                     nlevels=2)
            syl_ids, syl_lengths = pad_sequences(syl_ids, pad_tok=0, nlevels=2)

        fw_postag_ids, sequence_lengths = pad_sequences(fw_postag_ids, 0)
        bw_postag_ids, sequence_lengths = pad_sequences(bw_postag_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
        posTag_ids, _ = pad_sequences(posTag_ids, 0)
    else:
        word_ids, morph_ids = zip(*words)
        fw_lm_ids, _ = zip(*fw_words)
        bw_lm_ids, _ = zip(*bw_words)
        morph_ids, morph_lengths = pad_sequences(morph_ids, pad_tok=0,
                                                 nlevels=2)
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        fw_lm_ids, sequence_lengths = pad_sequences(fw_lm_ids, 0)
        bw_lm_ids, sequence_lengths = pad_sequences(bw_lm_ids, 0)

    # build feed dictionary
    feed = {
        self.word_ids: word_ids,
        self.fw_lm_ids: fw_lm_ids,
        self.bw_lm_ids: bw_lm_ids,
        self.sequence_lengths: sequence_lengths
    }
    # if test_flag == 1:
    #     print(word_ids)

    if self.config.posLM:
        feed[self.fw_pos_ids] = fw_postag_ids
        feed[self.bw_pos_ids] = bw_postag_ids

    if self.config.chars:
        feed[self.char_ids] = char_ids
        feed[self.word_lengths] = word_lengths

    if self.config.morphs:
        feed[self.morph_ids] = morph_ids
        feed[self.morph_lengths] = morph_lengths
        feed[self.syl_ids] = syl_ids
        feed[self.syl_lengths] = syl_lengths

    if self.config.dic_flag:
        feed[self.dic_ids] = dic_ids

    if labels is not None:
        labels, _ = pad_sequences(labels, 0)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    # posTag add
    if self.config.posTag:
        feed[self.posTag_ids] = posTag_ids

    return feed, sequence_lengths
def prepare_data(GLOVE_DIR, TEXT_DATA_DIR, MAX_SEQUENCE_LENGTH, MAX_NB_WORDS,
                 EMBEDDING_DIM, VALIDATION_SPLIT, categorical=True):
    """Mostly the same preprocessing as in the original post, with a couple
    of differences: sklearn's CountVectorizer is used instead of the keras
    tokenizer, and the train/test split is done with sklearn's
    train_test_split().
    """
    # build an index mapping words in the embeddings set to their embedding
    # vector
    print('Indexing word vectors.')
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))

    # second, prepare text samples and their labels
    print('Processing text dataset')
    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                    f.close()
                    labels.append(label_id)
    print('Found %s texts.' % len(texts))

    # vectorize the text samples into a 2D integer tensor. Using sklearn's
    # CountVectorizer instead of the keras tokenizer (as in the original
    # post) gives a slightly higher overlap between the selected words and
    # the words that have glove vectors; ultimately this does not make a
    # major difference.
    processed_texts = [clean_text(t) for t in texts]
    vectorizer = CountVectorizer(max_features=MAX_NB_WORDS)
    vectorizer_fit = vectorizer.fit_transform(processed_texts)

    # index the words so the most common word ("the") has index 1; in
    # practice this is irrelevant and is done only to stay comparable with
    # the dictionary keras would build
    words = vectorizer.get_feature_names()
    counts = vectorizer_fit.toarray().sum(axis=0)
    counts_words = list(zip(counts, words))
    counts_words.sort(reverse=True)
    vocabulary = [str(w[1]) for w in counts_words]
    word_index = dict(zip(vocabulary, range(MAX_NB_WORDS)))

    sequences = []
    for doc in processed_texts:
        sequence = []
        for word in doc.split():
            if word not in word_index:
                continue
            sequence.append(word_index[word])
        sequences.append(sequence)
    data = np.vstack([pad_sequences(s, MAX_SEQUENCE_LENGTH) for s in sequences])
    labels = np.asarray(labels)

    # split the data into a training set and a validation set
    x_train, x_val, y_train, y_val = train_test_split(
        data, labels, stratify=labels, test_size=VALIDATION_SPLIT)
    if categorical:
        y_train = one_hot(np.asarray(y_train))
        y_val = one_hot(np.asarray(y_val))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', y_train.shape)

    print('Preparing embedding matrix.')
    num_words = MAX_NB_WORDS
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return x_train, y_train, x_val, y_val, embedding_matrix
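# prepare_data relies on two helpers not shown here: a pad_sequences that
# maps a *single* id list to a fixed-length vector (it is called per document
# and the results are stacked with np.vstack), and a one_hot label encoder.
# Minimal sketches consistent with those call sites; the names exist in the
# original, but the padding side and pad token are assumptions.

import numpy as np

def pad_sequences(sequence, max_length, pad_tok=0):
    """Pad or truncate a single sequence to exactly max_length."""
    seq = list(sequence)[:max_length]
    return np.array(seq + [pad_tok] * (max_length - len(seq)))

def one_hot(labels):
    """Encode integer class labels as one-hot rows."""
    labels = np.asarray(labels)
    return np.eye(labels.max() + 1)[labels]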
def get_feed_dict(self, words, poss, chunks, labels_aspect=None,
                  labels_polarity=None, labels_joint=None, lr=None,
                  dropout=None, vocab_aspect_tags=None):
    """
    Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of word ids
        poss: list of lists of POS ids
        chunks: list of lists of chunk ids
        labels_aspect: list of lists of aspect label ids
        labels_polarity: list of lists of polarity label ids
        labels_joint: list of lists of joint label ids
        lr: (float) learning rate
        dropout: (float) keep prob
        vocab_aspect_tags: aspect tag vocabulary, used to compute the
            average label length

    Returns:
        dict {placeholder: value}
    """
    # perform padding of the given data
    word_ids, sequence_lengths = pad_sequences(words, self.config.n_words,
                                               self.config.max_sentence_size)

    # build feed dictionary
    feed = {
        self.word_ids: word_ids,
        self.sequence_lengths: sequence_lengths
    }

    if poss is not None:
        poss, _ = pad_sequences(poss, self.config.n_poss,
                                self.config.max_sentence_size)
        feed[self.pos_ids] = poss

    if chunks is not None:
        if self.config.use_mpqa:
            chunks, _ = pad_sequences(chunks, 0,
                                      self.config.max_sentence_size)
        else:
            chunks, _ = pad_sequences(chunks, self.config.n_chunks,
                                      self.config.max_sentence_size)
        feed[self.chunk_ids] = chunks

    if self.config.use_labels_length:
        if labels_aspect is not None and vocab_aspect_tags is not None:
            labels_average_ = labels_average_length(labels_aspect,
                                                    vocab_aspect_tags)
            feed[self.labels_aspect_average_length] = labels_average_

    if labels_aspect is not None:
        labels_aspect, _ = pad_sequences(labels_aspect, 0,
                                         self.config.max_sentence_size)
        feed[self.labels_aspect] = labels_aspect

    if labels_polarity is not None:
        labels_polarity, _ = pad_sequences(labels_polarity, 0,
                                           self.config.max_sentence_size)
        feed[self.labels_polarity] = labels_polarity

    if labels_joint is not None:
        labels_joint, _ = pad_sequences(labels_joint, 0,
                                        self.config.max_sentence_size)
        feed[self.labels_joint] = labels_joint

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    return feed, sequence_lengths