import heapq
import math

import tensorflow as tf
from keras import backend as K
from keras.models import model_from_json

# Project-level helpers (LogUtil, DataUtil, load_embedding, generate_part_ids,
# TitleContentCNN, Scale, data_helpers) are assumed to be imported elsewhere
# in this module.


def init_text_cnn(config):
    # configure the TensorFlow session for the requested device mode
    mode = config.get('ENVIRONMENT', 'mode')
    LogUtil.log('INFO', 'mode=%s' % mode)
    if mode == 'cpu':
        # pin both intra- and inter-op parallelism to the configured core count
        num_cores = config.getint('ENVIRONMENT', 'num_cores')
        tf_config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                                   inter_op_parallelism_threads=num_cores,
                                   allow_soft_placement=True,
                                   device_count={'CPU': num_cores})
        session = tf.Session(config=tf_config)
        K.set_session(session)
    elif mode == 'gpu':
        # grow GPU memory on demand instead of reserving it all up front
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        session = tf.Session(config=tf_config)
        K.set_session(session)

    # load the word embedding file
    word_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'word_embedding_fn'))
    word_embedding_index, word_embedding_matrix = load_embedding(word_embedding_fp)

    # load the char embedding file
    char_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'char_embedding_fn'))
    char_embedding_index, char_embedding_matrix = load_embedding(char_embedding_fp)

    # read the model hyper-parameters and build the model
    title_word_length = config.getint('TITLE_CONTENT_CNN', 'title_word_length')
    content_word_length = config.getint('TITLE_CONTENT_CNN', 'content_word_length')
    title_char_length = config.getint('TITLE_CONTENT_CNN', 'title_char_length')
    content_char_length = config.getint('TITLE_CONTENT_CNN', 'content_char_length')
    fs_btm_tw_cw_length = config.getint('TITLE_CONTENT_CNN', 'fs_btm_tw_cw_length')
    fs_btm_tc_length = config.getint('TITLE_CONTENT_CNN', 'fs_btm_tc_length')
    class_num = config.getint('TITLE_CONTENT_CNN', 'class_num')
    optimizer_name = config.get('TITLE_CONTENT_CNN', 'optimizer_name')
    lr = config.getfloat('TITLE_CONTENT_CNN', 'lr')
    metrics = config.get('TITLE_CONTENT_CNN', 'metrics').split()
    model = TitleContentCNN(title_word_length=title_word_length,
                            content_word_length=content_word_length,
                            title_char_length=title_char_length,
                            content_char_length=content_char_length,
                            fs_btm_tw_cw_length=fs_btm_tw_cw_length,
                            fs_btm_tc_length=fs_btm_tc_length,
                            class_num=class_num,
                            word_embedding_matrix=word_embedding_matrix,
                            char_embedding_matrix=char_embedding_matrix,
                            optimizer_name=optimizer_name,
                            lr=lr,
                            metrics=metrics)
    return model, word_embedding_index, char_embedding_index
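
# A minimal sketch of the config sections init_text_cnn reads (ConfigParser
# INI format). The section and option names come from the code above; every
# value below is a hypothetical placeholder, not the project's actual setting:
#
#   [ENVIRONMENT]
#   mode = gpu
#   num_cores = 8
#
#   [DIRECTORY]
#   embedding_pt = ./data/embedding
#
#   [TITLE_CONTENT_CNN]
#   word_embedding_fn = word_embedding.txt
#   char_embedding_fn = char_embedding.txt
#   title_word_length = 30
#   content_word_length = 150
#   title_char_length = 50
#   content_char_length = 300
#   fs_btm_tw_cw_length = 100
#   fs_btm_tc_length = 100
#   class_num = 1999
#   optimizer_name = adam
#   lr = 0.001
#   metrics = accuracy
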
def predict_val(config, part_id):
    version = config.get('TITLE_CONTENT_CNN', 'version')
    LogUtil.log('INFO', 'version=%s' % version)

    # load the word embedding file (only the index is needed for prediction)
    word_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'word_embedding_fn'))
    word_embedding_index, _ = load_embedding(word_embedding_fp)

    # load the char embedding file
    char_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'char_embedding_fn'))
    char_embedding_index, _ = load_embedding(char_embedding_fp)

    # expand part_id into the list of model parts to evaluate
    part_ids = generate_part_ids(config, part_id)

    # load the offline validation set index (stored 1-based, used 0-based)
    valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                  config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
    valid_index_off = [num - 1 for num in valid_index_off]

    for part_id in part_ids:
        LogUtil.log('INFO', 'part_id=%d' % part_id)

        # rebuild the model from its JSON architecture and load its weights
        model_fp = config.get('DIRECTORY', 'model_pt') + 'text_cnn_%03d' % part_id
        with open('%s.json' % model_fp, 'r') as json_file:
            model_json = json_file.read()
        model = model_from_json(model_json, {'Scale': Scale})
        model.load_weights('%s.h5' % model_fp)
        LogUtil.log('INFO', 'load model (%s) from disk done' % model_fp)

        # counters for position-discounted precision@5 and recall
        right_label_num = 0
        right_label_at_pos_num = [0] * 5
        sample_num = 0
        all_marked_label_num = 0
        precision = 0.0

        # predict on the validation set and save the raw scores for this part
        pred_fp = '%s/vote_fs_text_cnn_%s_%d.offline' % (config.get('DIRECTORY', 'pred_pt'),
                                                         version, part_id)
        pred_all_f = open(pred_fp, 'w')
        for sub_valid_dataset in data_helpers.load_dataset_from_file_loop(
                config, 'offline', word_embedding_index, char_embedding_index,
                valid_index_off, False):
            # the last element of the dataset tuple is the label matrix
            sub_valid_preds = model.predict(sub_valid_dataset[:-1],
                                            batch_size=32, verbose=True)
            for i, ps in enumerate(sub_valid_preds):
                pred_all_f.write('%s\n' % ','.join([str(num) for num in ps]))
                sample_num += 1
                # ids of the five highest-scoring labels
                top5_ids = [x[0] for x in heapq.nlargest(5, enumerate(ps),
                                                         key=lambda p: p[1])]
                # ids of the marked (ground-truth) labels
                label_ids = [kv[0] for kv in enumerate(sub_valid_dataset[-1][i])
                             if kv[1] == 1]
                marked_label_set = set(label_ids)
                all_marked_label_num += len(marked_label_set)
                for pos, label in enumerate(top5_ids):
                    if label in marked_label_set:
                        right_label_num += 1
                        right_label_at_pos_num[pos] += 1
        pred_all_f.close()

        # precision discounted by rank position, recall over all marked labels,
        # and the combined F score as P*R / (P+R)
        for pos, right_num in enumerate(right_label_at_pos_num):
            precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
        recall = float(right_label_num) / all_marked_label_num
        LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s' % (
            str(precision), str(recall),
            str((precision * recall) / (precision + recall))))
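
# A minimal driver sketch, assuming this module is run directly with a config
# file path and a part id on the command line; the config path is a
# hypothetical placeholder, and ConfigParser is the Python 2 module name
# (configparser on Python 3).
if __name__ == '__main__':
    import sys
    from ConfigParser import ConfigParser

    cf = ConfigParser()
    cf.read(sys.argv[1])               # e.g. conf/python.conf (hypothetical)
    predict_val(cf, int(sys.argv[2]))  # part id of the model(s) to evaluate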