def load_dataset_from_file_loop(config, data_name, word_emb_index, char_emb_index, inds, loop=True):
    version = config.get('TITLE_CONTENT_CNN', 'version')
    LogUtil.log('INFO', 'version=%s' % version)
    data_loader = __import__('bin.text_cnn.%s.data_loader' % version, fromlist=["*"])
    part_size = config.getint('TITLE_CONTENT_CNN', 'part_size')

    inds_len = len(inds)
    inds_index = 0
    sub_inds = list()
    while True:
        if inds_len <= inds_index:
            if loop:
                # wrap around: reshuffle and start a new pass over the indices
                inds_index = 0
                random.shuffle(inds)
            else:
                break
        sub_inds.append(inds[inds_index])
        inds_index += 1
        if (part_size == len(sub_inds)) or (inds_len <= inds_index):
            # drop duplicate indices while preserving order
            sub_inds = reduce(lambda x, y: x if y in x else x + [y], [[], ] + sub_inds)
            yield data_loader.load_dataset_from_file(config, data_name, word_emb_index, char_emb_index, sub_inds)
            sub_inds = list()

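# Minimal usage sketch (hypothetical names: `config`, `train_inds`, and the
# embedding indices are assumed to come from init_text_cnn below and the
# config file). The generator yields one deduplicated part of up to
# `part_size` examples at a time and, with loop=True, reshuffles and restarts
# once `inds` is exhausted:
#
#   part_iter = load_dataset_from_file_loop(config, 'train', word_emb_index,
#                                           char_emb_index, train_inds)
#   for part in part_iter:
#       pass  # feed one part to the model, then pull the next
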
def load_feature_vec_part(file_path, inds_copy, inds_map):
    vecs = [0] * len(inds_copy)
    index_f = 0
    index_inds = 0
    is_smat = isfile('%s.smat' % file_path)
    if is_smat:
        LogUtil.log('INFO', 'load sparse feature file %s' % file_path)
        f = open('%s.smat' % file_path, 'r')
        row_num, col_num = re.split(' |,', f.readline().strip('\n'))
        row_num = int(row_num)
        col_num = int(col_num)
    else:
        LogUtil.log('INFO', 'load dense feature file %s' % file_path)
        f = open(file_path, 'r')
        row_num = col_num = -1
    # inds_copy is sorted, so a single pass over the file suffices
    for line in f:
        if len(inds_copy) <= index_inds:
            break
        if index_f == inds_copy[index_inds]:
            # place the parsed vector back at its original (unsorted) position
            vecs[inds_map[index_inds]] = parse_feature_vec(line) if not is_smat else parse_feature_sparse_vec(line, col_num)
            index_inds += 1
        index_f += 1
    f.close()
    return vecs

def F(preds, labels):
    topk = 5
    right_label_num = 0
    right_label_at_pos_num = [0] * topk
    sample_num = 0
    all_marked_label_num = 0
    for i, ps in enumerate(preds):
        sample_num += 1
        # indices of the top-5 predicted labels
        top5_ids = [x[0] for x in heapq.nlargest(topk, enumerate(ps), key=lambda p: p[1])]
        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])
        marked_label_set = set(label_ids)
        all_marked_label_num += len(marked_label_set)
        for pos, label in enumerate(top5_ids):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1
    # position-weighted precision: a hit at rank `pos` counts 1 / log(2 + pos)
    precision = 0.0
    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num
    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s'
                % (str(precision), str(recall), str((precision * recall) / (precision + recall))))

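# Toy check of the metric above (self-contained, hypothetical data). One
# sample whose correct label is ranked first: the position-weighted precision
# is (1/1)/log(2.0), recall is 1/1, and the logged f equals p*r/(p+r):
#
#   toy_preds = [[0.9, 0.05, 0.02, 0.01, 0.01, 0.01]]
#   toy_labels = [[1, 0, 0, 0, 0, 0]]
#   F(toy_preds, toy_labels)
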
def init_text_cnn(config):
    # set number of cores
    mode = config.get('ENVIRONMENT', 'mode')
    LogUtil.log('INFO', 'mode=%s' % mode)
    if 'cpu' == mode:
        num_cores = config.getint('ENVIRONMENT', 'num_cores')
        tf_config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                                   inter_op_parallelism_threads=num_cores,
                                   allow_soft_placement=True,
                                   device_count={'CPU': num_cores})
        session = tf.Session(config=tf_config)
        K.set_session(session)
    elif 'gpu' == mode:
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        sess = tf.Session(config=tf_config)
        K.set_session(sess)

    # load word embedding file
    word_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'word_embedding_fn'))
    word_embedding_index, word_embedding_matrix = load_embedding(word_embedding_fp)

    # load char embedding file
    char_embedding_fp = '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                                   config.get('TITLE_CONTENT_CNN', 'char_embedding_fn'))
    char_embedding_index, char_embedding_matrix = load_embedding(char_embedding_fp)

    # init model
    title_word_length = config.getint('TITLE_CONTENT_CNN', 'title_word_length')
    content_word_length = config.getint('TITLE_CONTENT_CNN', 'content_word_length')
    title_char_length = config.getint('TITLE_CONTENT_CNN', 'title_char_length')
    content_char_length = config.getint('TITLE_CONTENT_CNN', 'content_char_length')
    fs_btm_tw_cw_length = config.getint('TITLE_CONTENT_CNN', 'fs_btm_tw_cw_length')
    fs_btm_tc_length = config.getint('TITLE_CONTENT_CNN', 'fs_btm_tc_length')
    class_num = config.getint('TITLE_CONTENT_CNN', 'class_num')
    optimizer_name = config.get('TITLE_CONTENT_CNN', 'optimizer_name')
    lr = float(config.get('TITLE_CONTENT_CNN', 'lr'))
    metrics = config.get('TITLE_CONTENT_CNN', 'metrics').split()
    model = TitleContentCNN(title_word_length=title_word_length,
                            content_word_length=content_word_length,
                            title_char_length=title_char_length,
                            content_char_length=content_char_length,
                            fs_btm_tw_cw_length=fs_btm_tw_cw_length,
                            fs_btm_tc_length=fs_btm_tc_length,
                            class_num=class_num,
                            word_embedding_matrix=word_embedding_matrix,
                            char_embedding_matrix=char_embedding_matrix,
                            optimizer_name=optimizer_name,
                            lr=lr,
                            metrics=metrics)
    return model, word_embedding_index, char_embedding_index

def load(self, model_fp):
    # load json and create model
    with open('%s.json' % model_fp, 'r') as json_file:
        model_json = json_file.read()
    self._model = model_from_json(model_json)
    # load weights into new model
    self._model.load_weights('%s.h5' % model_fp)
    LogUtil.log('INFO', 'load model (%s) from disk done' % model_fp)

def load_feature_vec(file_path):
    if isfile(file_path + '.smat'):
        LogUtil.log('INFO', 'load sparse feature file %s' % file_path)
        with open(file_path + '.smat', 'r') as f:
            # header line holds the matrix shape; only col_num is needed
            row_num, col_num = re.split(' |,', f.readline().strip('\n'))
            return [parse_feature_sparse_vec(line, int(col_num)) for line in f]
    else:
        LogUtil.log('INFO', 'load dense feature file %s' % file_path)
        with open(file_path, 'r') as f:
            return [parse_feature_vec(line) for line in f]

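# Assumed on-disk layouts, inferred from the readers above rather than from
# repo docs: a dense feature file holds one whitespace-separated float vector
# per line, while a `.smat` file starts with a "rows,cols" (or "rows cols")
# header followed by one sparse row per line in whatever format
# parse_feature_sparse_vec expects, e.g. "index:value" pairs:
#
#   3,1999
#   0:0.5 17:1.2
#   4:0.3
#   9:2.0 10:0.1
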
def load_raw_line_from_file(config, file_path, inds):
    # sort the requested indices for a single file pass; inds_map restores the original order
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]
    sub_lines = load_raw_line_part(file_path, inds_copy, inds_map)
    LogUtil.log('INFO', 'load raw line done')
    return sub_lines

def load_features_from_file(config, feature_name, data_name, inds):
    # sort the requested indices for a single file pass; inds_map restores the original order
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]
    # load features
    feature_fp = '%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), feature_name, data_name)
    sub_features = load_feature_vec_part(feature_fp, inds_copy, inds_map)
    LogUtil.log('INFO', 'len(sub_features)=%d' % len(sub_features))
    sub_features = np.asarray(sub_features, dtype='float32')
    LogUtil.log('INFO', 'load features done')
    return sub_features

def load_labels_from_file(config, data_name, inds):
    # sort the requested indices for a single file pass; inds_map restores the original order
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]
    # load label id vectors; the 'online' set has no labels
    lid_fp = None if 'online' == data_name \
        else '%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), 'label_id', data_name)
    class_num = config.getint('TITLE_CONTENT_CNN', 'class_num')
    sub_lid_vecs = None if lid_fp is None \
        else np.asarray(load_lid_part(lid_fp, class_num, inds_copy, inds_map), dtype='int32')
    LogUtil.log('INFO', 'load label id vector done')
    return sub_lid_vecs

def load_embedding_with_idx(file_path, emb_index):
    with open(file_path, 'r') as emb_f:
        shape = emb_f.readline().strip()
        emb_num, emb_size = [int(x) for x in shape.split()]
        LogUtil.log('INFO', 'embedding_shape=(%d, %d)' % (emb_num, emb_size))
        # emb_num + 2 rows: ids assigned by load_embedding start at 2,
        # leaving rows 0 and 1 as all-zero padding
        emb_matrix = np.zeros([emb_num + 2, emb_size])
        for line in emb_f:
            subs = line.strip().split()
            word = subs[0]
            vec = subs[1:]
            if word in emb_index:
                emb_matrix[emb_index[word]] = np.asarray(vec)
    return emb_matrix

def load_embedding(file_path):
    with open(file_path, 'r') as emb_f:
        shape = emb_f.readline().strip()
        emb_num, emb_size = [int(x) for x in shape.split()]
        LogUtil.log('INFO', 'embedding_shape=(%d, %d)' % (emb_num, emb_size))
        emb_index = {}
        # reserve rows 0 and 1 (all-zero vectors), so token ids start at 2
        emb_matrix = [['0.'] * emb_size, ['0.'] * emb_size]
        for line in emb_f:
            subs = line.strip().split()
            word = subs[0]
            vec = subs[1:]
            emb_index[word] = len(emb_matrix)
            emb_matrix.append(vec)
    emb_matrix = np.asarray(emb_matrix, dtype='float32')
    return emb_index, emb_matrix

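# The expected file layout is plain word2vec text format, as the parsing above
# implies: a "count size" header, then "token v1 ... v_size" rows. A
# hypothetical 2 x 3 file:
#
#   2 3
#   hello 0.1 0.2 0.3
#   world 0.4 0.5 0.6
#
# Rows 0 and 1 of the returned matrix are all-zero, so real token ids start
# at 2; load_embedding_with_idx sizes its matrix emb_num + 2 for the same
# reason.
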
def F_by_fuck_zhihu_eng(preds, labels):
    topk = 5
    top5_labels = list()
    for i, ps in enumerate(preds):
        # indices of the top-5 predicted labels, highest score first
        top5 = sorted(enumerate(ps), key=lambda s: s[1], reverse=True)
        top5_ids = [x[0] for x in top5[:topk]]
        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])
        top5_labels.append([top5_ids, label_ids])

    right_label_num = 0
    right_label_at_pos_num = [0] * topk
    sample_num = 0
    all_marked_label_num = 0
    for predict_labels, marked_labels in top5_labels:
        sample_num += 1
        marked_label_set = set(marked_labels)
        all_marked_label_num += len(marked_label_set)
        for pos, label in zip(range(0, min(len(predict_labels), topk)), predict_labels):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1
    precision = 0.0
    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num
    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s'
                % (str(precision), str(recall), str((precision * recall) / (precision + recall))))

def F_by_ids(ids, labels):
    topk = 5
    right_label_num = 0
    right_label_at_pos_num = [0] * topk
    sample_num = 0
    all_marked_label_num = 0
    for i, top5_ids in enumerate(ids):
        top5_ids = top5_ids[:topk]
        sample_num += 1
        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])
        marked_label_set = set(label_ids)
        all_marked_label_num += len(marked_label_set)
        for pos, label in enumerate(top5_ids):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1
    precision = 0.0
    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num
    f = (precision * recall) / (precision + recall)
    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s' % (str(precision), str(recall), str(f)))
    return f

def save(self, model_fp):
    # serialize the architecture to JSON and the weights to HDF5
    model_json = self._model.to_json()
    with open('%s.json' % model_fp, 'w') as json_file:
        json_file.write(model_json)
    self._model.save_weights('%s.h5' % model_fp)
    LogUtil.log('INFO', 'save model (%s) to disk done' % model_fp)
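
# Round-trip sketch (hypothetical path). save/load persist the architecture
# and weights as two sibling files:
#
#   model.save('model/title_content_cnn')  # writes .json and .h5
#   model.load('model/title_content_cnn')  # rebuilds model, reloads weights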