def input_fn(self, mode):
    infile = self._DATA_PATH[mode]
    data_generator = None

    # Case 1: a CSV file with the original text, which we embed with the
    # BERT encoder.
    if infile.endswith('.csv'):
        data = pd.read_csv(infile)
        data.sort_values(by=['case_id', 'candidate_id'],
                         inplace=True,
                         ascending=[True, True])
        data['embeddings'] = [x for x in BertEncoder.encode(data)]

        def data_generator_from_dataframe():
            # Yield one (features, labels) pair per case, grouping all of
            # the case's candidates into a single list.
            for case_id in data['case_id'].unique():
                chunk = data[data['case_id'] == case_id]
                vecs = chunk['embeddings']
                labels = chunk['candidate_is_noticed'].to_numpy()
                matrix = np.expand_dims(np.vstack(vecs), axis=-1)
                features_dict = {
                    str(k + 1): matrix[:, k]
                    for k in range(TFRanker._NUM_FEATURES)
                }
                yield features_dict, labels

        data_generator = data_generator_from_dataframe

    # Case 2: a LIBSVM-format file (generated with text_2_libsvm.py).
    # Use the modified reader: the stock one was shuffling the input data!
    if infile.endswith('.libsvm'):
        data_generator = libsvm_generator(infile, TFRanker._NUM_FEATURES,
                                          TFRanker._LIST_SIZE)

    # dataset = tf.data.Dataset.from_generator(
    #     data_generator,
    #     output_types=({str(k): tf.float32
    #                    for k in range(1, TFRanker._NUM_FEATURES + 1)},
    #                   tf.float32),
    #     output_shapes=({str(k): tf.TensorShape([TFRanker._LIST_SIZE, 1])
    #                     for k in range(1, TFRanker._NUM_FEATURES + 1)},
    #                    tf.TensorShape([TFRanker._LIST_SIZE])))

    # We don't have big datasets, so load them once and for all instead of
    # streaming through tf.data.Dataset.from_generator.
    all_data = list(data_generator())
    X = {}
    for i in range(TFRanker._NUM_FEATURES):
        X[str(i + 1)] = np.stack([x[0][str(i + 1)] for x in all_data], axis=0)
    Y = np.stack([x[1] for x in all_data])
    dataset = tf.data.Dataset.from_tensor_slices((X, Y))

    if mode == 'train':
        dataset = dataset.shuffle(300).repeat().batch(self._BATCH_SIZE)
    else:
        dataset = dataset.batch(self._BATCH_SIZE)

    # Queue up a number of batches on the CPU side.
    dataset = dataset.prefetch(8)

    # Queue up batches asynchronously onto the GPU. As long as there is a
    # pool of batches CPU side, a GPU prefetch of 1 is sufficient.
    # gpu = [x.name for x in device_lib.list_local_devices()
    #        if x.device_type == 'GPU']
    # if len(gpu) == 1:
    #     dataset = dataset.apply(
    #         tf.data.experimental.prefetch_to_device(gpu[0], buffer_size=8))

    # return dataset.make_one_shot_iterator().get_next()
    return dataset
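# For reference, a minimal sketch of the libsvm_generator consumed above,
# assuming the qid-grouped format emitted by text_2_libsvm.py (one
# "label qid:<case_id> <k>:<v> ... cid:<candidate_id>" line per candidate,
# with all candidates of a case on consecutive lines). This is an
# illustrative implementation, not necessarily the one in the repo; the
# -1.0 padding label follows the TF-Ranking convention for masked slots.
import numpy as np


def libsvm_generator(path, num_features, list_size):

    def _to_example(features, labels):
        # Pad (or truncate) one qid group to the fixed list size.
        matrix = np.zeros((list_size, num_features), dtype=np.float32)
        padded = np.full(list_size, -1.0, dtype=np.float32)
        n = min(len(labels), list_size)
        matrix[:n] = np.vstack(features)[:n]
        padded[:n] = np.asarray(labels, dtype=np.float32)[:n]
        features_dict = {
            str(k + 1): matrix[:, k:k + 1] for k in range(num_features)
        }
        return features_dict, padded

    def generator():
        group_qid, features, labels = None, [], []
        with open(path) as infile:
            for line in infile:
                tokens = line.split()
                label = float(tokens[0])
                qid = tokens[1].split(':')[1]
                vec = np.zeros(num_features, dtype=np.float32)
                for token in tokens[2:]:
                    key, value = token.split(':')
                    if key == 'cid':  # trailing candidate id, not a feature
                        continue
                    vec[int(key) - 1] = float(value)
                # A new qid closes the previous group; lines are NOT
                # shuffled, so groups are contiguous in the file.
                if group_qid is not None and qid != group_qid:
                    yield _to_example(features, labels)
                    features, labels = [], []
                group_qid = qid
                features.append(vec)
                labels.append(label)
            if labels:
                yield _to_example(features, labels)

    return generator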
def test():
    with open('config_judge.json', encoding='utf-8') as infile:
        config = json.load(infile)
    int2bool = {1: True, 0: False}

    sentiment_words_path = config["train_config"]["SENTIMENT_WORDS_PATH"]
    batch_size = config["train_config"]["BATCH_SIZE"]
    is_loop = int2bool[config["train_config"]["Is_LOOP"]]
    is_sort = int2bool[config["train_config"]["IS_SORT"]]
    dropout_rate = config["train_config"]["DROPOUT_RATE"]
    nb_classes = config["train_config"]["NB_CLASSES"]
    attention_dim = config["train_config"]["ATTENTION_DIM"]
    nb_hops = config["train_config"]["NB_HOPS"]
    drop_template_path = config["train_config"]["DROP_JUDGE_TEMPLATE_PATH"]
    use_bert = int2bool[config["train_config"]["USE_BERT"]]
    optimizer = config["train_config"]["OPTIMIZER"]
    learning_rate = config["train_config"]["LEARNING_RATE"]
    grad_clipper = config["train_config"]["GRAD_CLIPPER"]
    drop_judge_dev_path = config["train_config"]["DROP_JUDGE_DEV_PATH"]
    best_path = config["train_config"]["BEST_PATH"]
    question2targets_path = config["train_config"]["QUESTION2TARGETS_PATH"]
    use_extra_feature = config["train_config"]["USE_EXTRA_FEATURE"]
    ner_dict_size = config["train_config"]["NER_DICT_SIZE"]
    pos_dict_size = config["train_config"]["POS_DICT_SIZE"]
    extra_feature_dim = config["train_config"]["EXTRA_FEATURE_DIM"]
    ner_dict_path = config["train_config"]["NER_DICT_PATH"]
    pos_dict_path = config["train_config"]["POS_DICT_PATH"]
    rnn_dim = config["train_config"]["RNN_DIM"]
    lambda_l2 = config["train_config"]["LAMBDA_L2"]
    ans_max_len = config["train_config"]["ANS_MAX_LEN"]
    que_max_len = config["train_config"]["QUE_MAX_LEN"]
    sentiment_polarity_multiple = config["train_config"]["POLARITY_MULTIPLE"]

    # BERT and word2vec embeddings are mutually exclusive.
    use_w2v = not use_bert

    char_voc_path = config["w2v_config"]["CHAR_VOC_PATH"]
    char_embedding_matrix_path = config["w2v_config"][
        "CHAR_EMBEDDING_MATRIX_PATH"]
    word_voc_path = config["w2v_config"]["WORD_VOC_PATH"]
    word_embedding_matrix_path = config["w2v_config"][
        "WORD_EMBEDDING_MATRIX_PATH"]

    bert_model_path = config["bert_config"]["BERT_MODEL_PATH"]
    bert_config_file = config["bert_config"]["CONFIG_FILE"]
    bert_checkpoint_path = config["bert_config"]["INIT_CHECKPOINT"]
    bert_voc_path = config["bert_config"]["VOC_FILE"]
    sen2id_path = config["bert_config"]["SEN2ID_PATH"]

    judge_samples, _, _ = read_file(drop_judge_dev_path)
    judge_template, _, _ = read_file(drop_template_path)
    max_sequence_len = max(
        max(len(sample['question']) for sample in judge_samples),
        max(len(sample['answer']) for sample in judge_samples))

    with open(char_voc_path, 'rb') as infile:
        char_voc = pickle.load(infile)
    with open(word_voc_path, 'rb') as infile:
        word_voc = pickle.load(infile)

    # question2targets is required by make_instances below. The loader is
    # assumed to be pickle here; adjust if the mapping is stored otherwise.
    with open(question2targets_path, 'rb') as infile:
        question2targets = pickle.load(infile)

    bert_encoder = BertEncoder(model_root=bert_model_path,
                               bert_config_file=bert_config_file,
                               init_checkpoint=bert_checkpoint_path,
                               vocab_file=bert_voc_path,
                               max_sequence_len=max_sequence_len,
                               embedding_batch=3,
                               embedding_matrix_path=None,
                               sen2id_path=sen2id_path,
                               vec_dim=768)

    instances_judge_dev = make_instances(judge_samples,
                                         char_voc,
                                         word_voc,
                                         sentiment_words_path,
                                         ner_dict_path=ner_dict_path,
                                         pos_dict_path=pos_dict_path,
                                         use_extra_feature=use_extra_feature,
                                         question2targets=question2targets,
                                         is_training=False,
                                         need_augment=False)
    instances_judge_dev_with_match_result(instances_judge_dev)

    data_stream_judge_dev = DataStream(instances=instances_judge_dev,
                                       is_shuffle=False,
                                       is_loop=is_loop,
                                       batch_size=batch_size,
                                       ans_max_len=ans_max_len,
                                       que_max_len=que_max_len,
                                       use_bert=use_bert,
                                       bert_encoder=bert_encoder,
                                       is_sort=is_sort)

    with tf.Graph().as_default():
        with tf.variable_scope("Model",
                               reuse=False,
                               initializer=tf.glorot_uniform_initializer()):
            answer_understander_dev = AnswerUnderstander(
                use_bert=use_bert,
                use_w2v=use_w2v,
                rnn_unit='lstm',
                dropout_rate=dropout_rate,
                optimizer=optimizer,
                learning_rate=learning_rate,
                grad_clipper=grad_clipper,
                global_step=None,
                attention_dim=attention_dim,
                nb_hops=nb_hops,
                rnn_dim=rnn_dim,
                lambda_l2=lambda_l2,
                is_training=False,
                sentiment_polarity_multiple=sentiment_polarity_multiple,
                nb_classes=nb_classes,
                use_extra_feature=use_extra_feature,
                ner_dict_size=ner_dict_size,
                pos_dict_size=pos_dict_size,
                extra_feature_dim=extra_feature_dim,
                ans_max_len=ans_max_len,
                que_max_len=que_max_len,
                char_w2v_embedding_matrix_path=char_embedding_matrix_path,
                word_w2v_embedding_matrix_path=word_embedding_matrix_path)
        saver = tf.train.Saver()
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, best_path)
        # test() performs a single evaluation pass, so the result file name
        # is fixed ('result_judge.txt' is an assumed name).
        judge_acc = evaluation(sess, answer_understander_dev,
                               data_stream_judge_dev, 'result_judge.txt')
        print("the final judge accuracy: {}".format(judge_acc))
        return judge_acc
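# test() assumes a config_judge.json shaped roughly as below. The values
# are illustrative placeholders only, not the project's actual settings;
# 0/1 integer flags are mapped to booleans through int2bool.
#
# {
#   "train_config": {
#     "BATCH_SIZE": 32,
#     "Is_LOOP": 0,
#     "IS_SORT": 1,
#     "USE_BERT": 1,
#     "DROP_JUDGE_DEV_PATH": "data/drop_judge_dev.txt",
#     "DROP_JUDGE_TEMPLATE_PATH": "data/drop_judge_template.txt",
#     "QUESTION2TARGETS_PATH": "data/question2targets.pkl",
#     "BEST_PATH": "checkpoints/best_model"
#     // ... plus the remaining train_config keys read above
#   },
#   "w2v_config": {
#     "CHAR_VOC_PATH": "data/char_voc.pkl",
#     "CHAR_EMBEDDING_MATRIX_PATH": "data/char_embedding.npy",
#     "WORD_VOC_PATH": "data/word_voc.pkl",
#     "WORD_EMBEDDING_MATRIX_PATH": "data/word_embedding.npy"
#   },
#   "bert_config": {
#     "BERT_MODEL_PATH": "bert/",
#     "CONFIG_FILE": "bert/bert_config.json",
#     "INIT_CHECKPOINT": "bert/bert_model.ckpt",
#     "VOC_FILE": "bert/vocab.txt",
#     "SEN2ID_PATH": "data/sen2id.pkl"
#   }
# }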
INPUT_TRAIN = 'data/text/train_summarized_200.csv'
INPUT_EVAL = 'data/text/eval_summarized_200.csv'
INPUT_TEST = 'data/text/test_summarized_200.csv'
OUTPUT = {
    INPUT_TRAIN: 'data/libsvm/train_features.libsvm',
    INPUT_EVAL: 'data/libsvm/eval_features.libsvm',
    INPUT_TEST: 'data/libsvm/test_features.libsvm',
}

INPUTS = [INPUT_TEST]
for input_path in INPUTS:
    data = pd.read_csv(input_path)
    data.sort_values(by=['case_id', 'candidate_id'],
                     inplace=True,
                     ascending=[True, True])
    data['embeddings'] = [x for x in BertEncoder.encode(data)]
    with open(OUTPUT[input_path], 'w') as output:
        for _, sample in data.iterrows():
            vec = sample['embeddings']
            label = int(sample['candidate_is_noticed'])
            case_id = sample['case_id']
            candidate_id = sample['candidate_id']
            features = ' '.join(
                '{}:{}'.format(i + 1, v) for i, v in enumerate(vec))
            output.write('{} qid:{} {} cid:{}\n'.format(
                label, case_id, features, candidate_id))
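# Each emitted line thus has the form (feature values here are made up,
# and real lines carry one indexed feature per embedding dimension):
#
#   1 qid:42 1:0.0132 2:-0.4511 3:0.0874 cid:7
#
# That is: a binary relevance label, the case id as the LIBSVM query id,
# the BERT embedding as 1-based indexed features, and a trailing
# nonstandard cid token carrying the candidate id, which downstream
# readers must skip when parsing features.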