def m3(config_path='e83.yaml'):
    """
    Evaluate a staged ensemble on the TEST split of label version 'B' and
    write the final votes to ``latest_ef83.label``.

    [Usage] (as recorded in the original source)
    python3 -m algo.ensemble93 main -e mv --build-analysis
    NOTE(review): the usage line names ``main``; it looks copied from
    another entry point -- confirm the real command for ``m3``.

    Stages:
      1. The base vote is the first element of every item returned by
         ``combine`` for ``config.components()``.
      2. For i in 1, 2, 3 the components registered under key ``'b0<i>'``
         are combined.  A sample's vote is replaced by that stage's vote
         (0 if the stage voted 0, otherwise i) only when its current vote
         is 0 or i AND the second element of the stage result reaches
         ``config.thr('b0<i>')``.
    An evaluation and the confusion matrix are printed after every stage.

    :param config_path: string, path of the YAML ensemble configuration
    :return: None
    """
    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    for mode in [TEST, ]:
        labels_gold = load_label_list(data_config.path(mode, LABEL, 'B'))

        b_result = combine(output_keys=config.components(), mode=mode)
        last_vote = [_item[0] for _item in b_result]

        res = basic_evaluate(gold=labels_gold, pred=last_vote)
        print('{}'.format(mode))
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))

        for i in [1, 2, 3]:
            key = 'b0{}'.format(i)
            thr = config.thr(key)
            stage_result = combine(output_keys=config.components(key), mode=mode)

            new_vote = list()
            for l_v, b0_res in zip(last_vote, stage_result):
                this_vote = 0 if b0_res[0] == 0 else i
                # Only samples currently labelled 0 or i may be re-labelled,
                # and only when the stage clears its threshold.
                if l_v in {0, i} and b0_res[1] >= thr:
                    new_vote.append(this_vote)
                else:
                    new_vote.append(l_v)
            last_vote = new_vote

            res = basic_evaluate(gold=labels_gold, pred=new_vote)
            print('{} - {}'.format(mode, i))
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))

        # Context manager guarantees the label file is flushed and closed.
        with open('latest_ef83.label', 'w') as label_file:
            label_file.write('\n'.join(map(str, last_vote)))
def main(dataset_key, label_version=None, config_path='config_svm.yaml', kernel='rbf'):
    """
    Fit an SVM on the pre-computed features of a dataset and print the
    evaluation on the TRAIN and TEST splits.

    python algo/svm.py main semeval2018_task3 A

    :param dataset_key: string, name of the package under ``dataset`` whose
        ``config`` module provides the data configuration
    :param label_version: string or None
    :param config_path: string, path of the YAML training configuration
    :param kernel: string, kernel passed to ``svm.SVC``; with ``'linear'``
        the feature indices ordered by ascending absolute weight are printed
    :return: None
    """
    pos_label = None
    if dataset_key == 'semeval2018_task3' and label_version == 'A':
        pos_label = 1

    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    train_config = Config(data=config_data)

    data_config = getattr(
        importlib.import_module('dataset.{}.config'.format(dataset_key)),
        'config')
    datasets = load_dataset(data_config, train_config, label_version)

    class_weight = 'balanced' if train_config.use_class_weights else None

    clf = svm.SVC(class_weight=class_weight, kernel=kernel)
    clf.fit(X=datasets[TRAIN][FEATS], y=datasets[TRAIN][LABEL_GOLD])

    if kernel == 'linear':
        # coef_ is only read for the linear kernel; list the feature
        # indices from the smallest to the largest absolute weight.
        ranked = sorted(enumerate(clf.coef_.ravel()),
                        key=lambda _item: math.fabs(_item[1]))
        print([_index for _index, _ in ranked])

    for mode in [TRAIN, TEST]:
        labels_predict = clf.predict(X=datasets[mode][FEATS])
        labels_gold = datasets[mode][LABEL_GOLD]
        res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)

        print(mode)
        print_evaluation(res)
        print()
def m3a(target=0, thr=1, config_path='e83a.yaml'):
    """
    Two-step binary ensemble evaluated on the TEST split of label version 'A'.

    Step 1: a sample is labelled ``target`` only when the combined vote
    equals ``target`` and the second element of the combined result reaches
    ``thr``; otherwise it is labelled ``1 - target``.
    Step 2: the components registered under key ``'b'`` are combined with
    ``full_output=True``; wherever the first entry of a sample's count is
    at most 1 the label is forced to 0, otherwise the step-1 label is kept.
    The evaluation and confusion matrix are printed after each step.

    :param target: int (or int-like string), the label that must clear the threshold
    :param thr: int (or int-like string), minimum value of the second result element
    :param config_path: string, path of the YAML ensemble configuration
    :return: None
    """
    target = int(target)
    thr = int(thr)

    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    for mode in [TEST, ]:
        labels_gold = load_label_list(data_config.path(mode, LABEL, 'A'))

        # Step 1: thresholded vote for ``target``.
        b_result = combine(output_keys=config.components(), mode=mode)
        new_vote = [
            target if (r[0] == target and r[1] >= thr) else 1 - target
            for r in b_result
        ]

        res = basic_evaluate(gold=labels_gold, pred=new_vote)
        print('{}'.format(mode))
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))

        # Step 2: override with label 0 where the 'b' components' first count is <= 1.
        last_vote = new_vote
        output_keys = config.components('b')
        b_result, counts = combine(output_keys=output_keys, mode=mode, full_output=True)
        new_vote = [
            0 if count[0] <= 1 else l_v
            for count, l_v in zip(counts, last_vote)
        ]

        res = basic_evaluate(gold=labels_gold, pred=new_vote)
        print('{}'.format(mode))
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))
def main(config_path='e83.yaml'):
    """
    Cascade three ensembles into a 4-class prediction for label version 'B'
    and print the evaluation on the TRAIN and TEST splits.

    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis
    NOTE(review): this function only accepts ``config_path``; the ``-e`` and
    ``--build-analysis`` flags in the usage line look stale -- confirm.

    Cascade (votes are the first element of each ``combine`` result item):
      * 'b'  votes 0                 -> label 0
      * otherwise 'b2' votes 0       -> label 1
      * otherwise 'b3' votes 0       -> label 2
      * otherwise                    -> label 3

    :param config_path: string, path of the YAML ensemble configuration
    :return: None
    """
    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    for mode in [TRAIN, TEST]:
        b_result = combine(output_keys=config.components('b'), mode=mode)
        b_vote = [_item[0] for _item in b_result]

        b2_result = combine(output_keys=config.components('b2'), mode=mode)
        b2_vote = [_item[0] for _item in b2_result]

        # First two stages decide between 0, 1 and "2 or later".
        last_vote = list()
        for b_v, b2_v in zip(b_vote, b2_vote):
            if b_v == 0:
                label = 0
            elif b2_v == 0:
                label = 1
            else:
                label = 2
            last_vote.append(label)

        b3_result = combine(output_keys=config.components('b3'), mode=mode)
        b3_vote = [_item[0] for _item in b3_result]

        # The third stage only refines samples still labelled 2.
        labels_predict = list()
        for last_v, b3_v in zip(last_vote, b3_vote):
            if last_v != 2:
                label = last_v
            elif b3_v == 0:
                label = 2
            else:
                label = 3
            labels_predict.append(label)

        labels_gold = load_label_list(data_config.path(mode, LABEL, 'B'))
        res = basic_evaluate(gold=labels_gold, pred=labels_predict)

        print(mode)
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))
def export_error(filename):
    """
    Print the evaluation of a prediction file against the original 'B' test
    set, followed by the misclassified samples grouped by gold -> predicted
    label for every pair of labels in 0..3.

    :param filename: string, path of the file holding the predicted labels
    :return: None
    """
    predictions = load_label_list(filename)
    samples = Processor.load_origin_test('B')

    # mistakes[gold][predicted] -> second field of each misclassified sample
    mistakes = defaultdict(lambda: defaultdict(lambda: list()))
    gold_labels = [entry[0] for entry in samples]

    print(basic_evaluate(gold=gold_labels, pred=predictions))

    for predicted, entry in zip(predictions, samples):
        expected = entry[0]
        if predicted == expected:
            continue
        mistakes[expected][predicted].append(entry[1])

    for expected in range(4):
        per_gold = mistakes[expected]
        for predicted in range(4):
            print('{}->{}'.format(expected, predicted))
            for item in per_gold[predicted]:
                print('\t{}'.format(item))
def train(text_version='ek', label_version=None, config_path='c83.yaml'):
    """
    Train ``NNModel`` with early stopping, then reload the latest checkpoint
    and export hidden features, predicted labels and probabilities for the
    TRAIN and TEST splits.

    python -m algo.main93_v2 train
    python3 -m algo.main93_v2 train -c config_ntua93.yaml

    Outputs written under ``data_config.output_path(output_key, ...)``:
    a copy of the config, the vocab-id mapping (JSON), the per-epoch
    evaluation history (JSON), per-mode hidden features / labels /
    probabilities, and the best evaluation of every mode (JSON).

    :param text_version: string
    :param label_version: string or None; 'A' makes label 1 the positive label
    :param config_path: string, path of the YAML configuration
    :return: None
    """
    pos_label = 1 if label_version == 'A' else None

    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)

    output_key = '{}_{}_{}'.format(NNModel.name, text_version, int(time.time()))
    if label_version is not None:
        output_key = '{}_{}'.format(label_version, output_key)
    print('OUTPUT_KEY: {}'.format(output_key))

    # Prepare the output directories
    data_config.prepare_output_folder(output_key=output_key)
    data_config.prepare_model_folder(output_key=output_key)

    shutil.copy(config_path, data_config.output_path(output_key, ALL, CONFIG))

    w2v_key = '{}_{}'.format(config_data['word']['w2v_version'], text_version)
    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, text_version)

    # Load the vocabulary.
    # The model uses every word vector supported by the pretrained
    # embedding and randomly initialises vectors for words that occur
    # often enough.
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [
        _meta['t'] for _meta in vocab_meta_list
        if _meta['tf'] >= config_data['word']['min_tf']
    ]

    # Load the word vectors and related data
    lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table2(
        w2v_model_path=w2v_model_path, vocabs=vocabs)
    with open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING), 'w') as file_obj:
        json.dump(vocab_id_mapping, file_obj)

    # Load the configuration
    nn_config = NNConfig(config_data)
    train_config = TrainConfig(config_data['train'])
    early_stop_metric = train_config.early_stop_metric

    # Load the training data
    datasets = dict()
    datasets[TRAIN], output_dim = load_dataset(
        mode=TRAIN, vocab_id_mapping=vocab_id_mapping,
        max_seq_len=nn_config.seq_len,
        sampling=train_config.train_sampling,
        label_version=label_version)
    datasets[TEST], _ = load_dataset(
        mode=TEST, vocab_id_mapping=vocab_id_mapping,
        max_seq_len=nn_config.seq_len,
        label_version=label_version)

    # Initialise the index iterator over the training data
    index_iterators = {
        TRAIN: IndexIterator.from_dataset(datasets[TRAIN]),
    }
    # Split the training data into train / validation parts as configured
    index_iterators[TRAIN].split_train_valid(train_config.valid_rate)

    # Compute the weight of every class
    if train_config.use_class_weights:
        label_weight = {
            # Follows the formula of sklearn's class_weight='balanced';
            # experiments showed a notable improvement.
            _label: float(index_iterators[TRAIN].n_sample()) /
            (index_iterators[TRAIN].dim * len(_index))
            for _label, _index in index_iterators[TRAIN].label_index.items()
        }
    else:
        label_weight = {
            _label: 1. for _label in range(index_iterators[TRAIN].dim)
        }

    # Update the configuration from the loaded data
    nn_config.set_embedding_dim(embedding_dim)
    nn_config.set_output_dim(output_dim)

    # Build the neural network
    nn = NNModel(config=nn_config)
    nn.build_neural_network(lookup_table=lookup_table)

    batch_size = train_config.batch_size
    fetches = {
        mode: {_key: nn.var(_key) for _key in fetch_key[mode]}
        for mode in [TRAIN, TEST]
    }

    model_output_prefix = data_config.model_path(key=output_key) + '/model'

    best_res = {mode: None for mode in [TRAIN, VALID]}
    no_update_count = {mode: 0 for mode in [TRAIN, VALID]}
    max_no_update_count = 10

    eval_history = {TRAIN: list(), VALID: list(), TEST: list()}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        dataset = datasets[TRAIN]
        index_iterator = index_iterators[TRAIN]

        # Training starts ######################################################
        for epoch in range(train_config.epoch):
            print('== epoch {} = {} ='.format(epoch, output_key))

            # Train on the training part
            print('TRAIN')
            n_sample = index_iterator.n_sample(TRAIN)
            labels_predict = list()
            labels_gold = list()

            for batch_index in index_iterator.iterate(batch_size, mode=TRAIN, shuffle=True):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TRAIN]
                }
                feed_dict[nn.var(SAMPLE_WEIGHTS)] = list(
                    map(label_weight.get, feed_dict[nn.var(LABEL_GOLD)]))
                feed_dict[nn.var(TEST_MODE)] = 0
                res = sess.run(fetches=fetches[TRAIN], feed_dict=feed_dict)
                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample entries of the accumulated lists.
            labels_predict, labels_gold = labels_predict[:n_sample], labels_gold[:n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
            print_evaluation(res)
            eval_history[TRAIN].append(res)

            global_step = tf.train.global_step(sess, nn.var(GLOBAL_STEP))

            improved_on_train = (
                best_res[TRAIN] is None
                or res[early_stop_metric] > best_res[TRAIN][early_stop_metric]
            )
            if train_config.valid_rate == 0.:
                # No validation part: checkpoints follow the training metric.
                if improved_on_train:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                    saver.save(sess, save_path=model_output_prefix, global_step=global_step)
                else:
                    no_update_count[TRAIN] += 1
            else:
                if improved_on_train:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                else:
                    no_update_count[TRAIN] += 1

                # Evaluate on the validation part without updating the model
                print('VALID')
                n_sample = index_iterator.n_sample(VALID)
                labels_predict = list()
                labels_gold = list()

                for batch_index in index_iterator.iterate(batch_size, mode=VALID, shuffle=False):
                    feed_dict = {
                        nn.var(_key): dataset[_key][batch_index]
                        for _key in feed_key[TEST]
                    }
                    feed_dict[nn.var(TEST_MODE)] = 1
                    res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                    labels_predict += res[LABEL_PREDICT].tolist()
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

                labels_predict, labels_gold = labels_predict[:n_sample], labels_gold[:n_sample]
                res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
                eval_history[VALID].append(res)
                print_evaluation(res)

                # Early stop bookkeeping: checkpoint on validation improvement
                if best_res[VALID] is None or \
                        res[early_stop_metric] > best_res[VALID][early_stop_metric]:
                    saver.save(sess, save_path=model_output_prefix, global_step=global_step)
                    best_res[VALID] = res
                    no_update_count[VALID] = 0
                else:
                    no_update_count[VALID] += 1

            # Evaluate on the test split (recorded in the history only)
            _mode = TEST
            _dataset = datasets[_mode]
            _index_iterator = SimpleIndexIterator.from_dataset(_dataset)
            _n_sample = _index_iterator.n_sample()

            labels_predict = list()
            labels_gold = list()
            for batch_index in _index_iterator.iterate(batch_size, shuffle=False):
                feed_dict = {
                    nn.var(_key): _dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += _dataset[LABEL_GOLD][batch_index].tolist()

            labels_predict, labels_gold = labels_predict[:_n_sample], labels_gold[:_n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
            eval_history[TEST].append(res)
            print('TEST')
            print_evaluation(res)

            # NOTE(review): only the TRAIN counter stops training;
            # no_update_count[VALID] is maintained but never read -- confirm
            # this is intended.
            if no_update_count[TRAIN] >= max_no_update_count:
                break
        # Training ends ########################################################

    print(
        '========================= BEST ROUND EVALUATION ========================='
    )

    with open(data_config.output_path(output_key, 'eval', 'json'), 'w') as file_obj:
        json.dump(eval_history, file_obj)

    with tf.Session() as sess:
        prefix_checkpoint = tf.train.latest_checkpoint(
            data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        for mode in [TRAIN, TEST]:
            if mode == TRAIN and train_config.train_sampling:
                # Training used a sampled dataset; reload it without sampling
                # for the export.
                dataset, _ = load_dataset(
                    mode=TRAIN, vocab_id_mapping=vocab_id_mapping,
                    max_seq_len=nn_config.seq_len,
                    sampling=False,
                    label_version=label_version)
            else:
                dataset = datasets[mode]
            index_iterator = SimpleIndexIterator.from_dataset(dataset)
            n_sample = index_iterator.n_sample()

            prob_predict = list()
            labels_predict = list()
            labels_gold = list()
            hidden_feats = list()

            for batch_index in index_iterator.iterate(batch_size, shuffle=False):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                prob_predict += res[PROB_PREDICT].tolist()
                labels_predict += res[LABEL_PREDICT].tolist()
                hidden_feats += res[HIDDEN_FEAT].tolist()
                if LABEL_GOLD in dataset:
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            prob_predict = prob_predict[:n_sample]
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            hidden_feats = hidden_feats[:n_sample]

            if mode == TEST:
                res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
                best_res[TEST] = res

            # Export the hidden-layer features
            with open(data_config.output_path(output_key, mode, HIDDEN_FEAT), 'w') as file_obj:
                for _feat in hidden_feats:
                    file_obj.write('\t'.join(map(str, _feat)) + '\n')
            # Export the predicted labels
            with open(data_config.output_path(output_key, mode, LABEL_PREDICT), 'w') as file_obj:
                for _label in labels_predict:
                    file_obj.write('{}\n'.format(_label))
            # Export the predicted probabilities
            with open(data_config.output_path(output_key, mode, PROB_PREDICT), 'w') as file_obj:
                for _prob in prob_predict:
                    file_obj.write('\t'.join(map(str, _prob)) + '\n')

        for mode in [TRAIN, VALID, TEST]:
            # Without a validation part there is no VALID result to report.
            if mode == VALID and train_config.valid_rate == 0.:
                continue
            res = best_res[mode]
            print(mode)
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))

            with open(data_config.output_path(output_key, mode, EVALUATION), 'w') as file_obj:
                json.dump(res, file_obj)
            print()

    test_score_list = [_item['f1'] for _item in eval_history[TEST]]
    print('best test f1 reached: {}'.format(max(test_score_list)))

    print('OUTPUT_KEY: {}'.format(output_key))
def train(dataset_key, text_version, label_version=None, config_path='config.yaml'):
    """
    Train the ``NNModel`` named by the config's ``module`` entry on a
    dataset, checkpointing on F1 improvement, then reload the latest
    checkpoint and export hidden features, predicted labels and
    probabilities for the TRAIN and TEST splits.

    python algo/main.py train semeval2018_task3 -l A -t ek
    python algo/main.py train semeval2018_task3 -l A -t ek -c config_ntua.yaml
    python algo/main.py train semeval2018_task3 -l A -t raw -c config_ntua_char.yaml
    python algo/main.py train semeval2019_task3_dev -t ek
    python algo/main.py train semeval2018_task1 -l love
    python algo/main.py train semeval2014_task9

    :param dataset_key: string, name of the package under ``dataset`` whose
        ``config`` module provides the data configuration
    :param text_version: string
    :param label_version: string or None
    :param config_path: string, path of the YAML configuration
    :return: None
    :raises ValueError: when the configured analyzer is neither WORD nor CHAR
    """
    pos_label = None
    if dataset_key == 'semeval2018_task3' and label_version == 'A':
        pos_label = 1

    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)

    data_config = getattr(
        importlib.import_module('dataset.{}.config'.format(dataset_key)),
        'config')

    output_key = '{}_{}_{}'.format(config_data['module'].rsplit('.', 1)[1],
                                   text_version, int(time.time()))
    if label_version is not None:
        output_key = '{}_{}'.format(label_version, output_key)
    print('OUTPUT_KEY: {}'.format(output_key))

    # Prepare the output directories
    data_config.prepare_output_folder(output_key=output_key)
    data_config.prepare_model_folder(output_key=output_key)

    shutil.copy(config_path, data_config.output_path(output_key, ALL, CONFIG))

    # Load the model module named in the configuration
    module_relative_path = config_data['module']
    NNModel = getattr(importlib.import_module(module_relative_path), 'NNModel')
    NNConfig = getattr(importlib.import_module(module_relative_path), 'NNConfig')

    if config_data['analyzer'] == WORD:
        w2v_key = '{}_{}'.format(config_data['word']['w2v_version'], text_version)
        w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
        vocab_train_path = data_config.path(TRAIN, VOCAB, text_version)

        # Load the vocabulary.
        # The model uses every word vector supported by the pretrained
        # embedding and randomly initialises vectors for words that occur
        # often enough.
        vocab_meta_list = load_vocab_list(vocab_train_path)
        vocab_meta_list += load_vocab_list(
            semeval2018_task3_date_config.path(TRAIN, VOCAB, text_version))
        vocabs = [
            _meta['t'] for _meta in vocab_meta_list
            if _meta['tf'] >= config_data[WORD]['min_tf']
        ]

        # Load the word vectors and related data
        lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table(
            w2v_model_path=w2v_model_path, vocabs=vocabs)
        with open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING), 'w') as file_obj:
            json.dump(vocab_id_mapping, file_obj)
        max_seq_len = MAX_WORD_SEQ_LEN
    elif config_data['analyzer'] == CHAR:
        # Character vocabulary: every distinct character of the training texts.
        texts = load_text_list(data_config.path(TRAIN, TEXT))
        char_set = set()
        for text in texts:
            char_set |= set(text)
        lookup_table, vocab_id_mapping, embedding_dim = build_random_lookup_table(
            vocabs=char_set, dim=config_data['char']['embedding_dim'])
        max_seq_len = MAX_CHAR_SEQ_LEN
    else:
        raise ValueError('invalid analyzer: {}'.format(
            config_data['analyzer']))

    # Load the training data
    datasets, output_dim = load_dataset(data_config=data_config,
                                        analyzer=config_data['analyzer'],
                                        vocab_id_mapping=vocab_id_mapping,
                                        seq_len=max_seq_len,
                                        with_label=True,
                                        label_version=label_version,
                                        text_version=text_version)

    # Load the configuration
    nn_config = NNConfig(config_data)
    train_config = TrainConfig(config_data['train'])

    # Initialise the index iterators over the datasets
    index_iterators = {
        mode: IndexIterator(datasets[mode][LABEL_GOLD])
        for mode in [TRAIN, TEST]
    }
    # Split the training data into train / validation parts as configured
    index_iterators[TRAIN].split_train_valid(train_config.valid_rate)

    # Compute the weight of every class
    if train_config.use_class_weights:
        label_weight = {
            # Follows the formula of sklearn's class_weight='balanced';
            # experiments showed a notable improvement.
            _label: float(index_iterators[TRAIN].n_sample()) /
            (index_iterators[TRAIN].dim * len(_index))
            for _label, _index in index_iterators[TRAIN].label_index.items()
        }
    else:
        label_weight = {
            _label: 1. for _label in range(index_iterators[TRAIN].dim)
        }

    # Update the configuration from the loaded data
    nn_config.set_embedding_dim(embedding_dim)
    nn_config.set_output_dim(output_dim)
    nn_config.set_seq_len(max_seq_len)

    # Build the neural network
    nn = NNModel(config=nn_config)
    nn.build_neural_network(lookup_table=lookup_table)

    batch_size = train_config.batch_size
    fetches = {
        mode: {_key: nn.var(_key) for _key in fetch_key[mode]}
        for mode in [TRAIN, TEST]
    }

    model_output_prefix = data_config.model_path(key=output_key) + '/model'

    best_res = {mode: None for mode in [TRAIN, VALID]}
    no_update_count = {mode: 0 for mode in [TRAIN, VALID]}
    max_no_update_count = 10

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        dataset = datasets[TRAIN]
        index_iterator = index_iterators[TRAIN]

        # Training starts ######################################################
        for epoch in range(train_config.epoch):
            print('== epoch {} =='.format(epoch))

            # Train on the training part
            print('TRAIN')
            n_sample = index_iterator.n_sample(TRAIN)
            labels_predict = list()
            labels_gold = list()

            for batch_index in index_iterator.iterate(batch_size, mode=TRAIN, shuffle=True):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TRAIN]
                }
                feed_dict[nn.var(SAMPLE_WEIGHTS)] = list(
                    map(label_weight.get, feed_dict[nn.var(LABEL_GOLD)]))
                feed_dict[nn.var(TEST_MODE)] = 0
                res = sess.run(fetches=fetches[TRAIN], feed_dict=feed_dict)
                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample entries of the accumulated lists
            # (the original repeated this statement twice; once is enough
            # because slicing is idempotent).
            labels_predict, labels_gold = labels_predict[:n_sample], labels_gold[:n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
            print_evaluation(res)

            global_step = tf.train.global_step(sess, nn.var(GLOBAL_STEP))

            improved_on_train = (
                best_res[TRAIN] is None
                or res[F1_SCORE] > best_res[TRAIN][F1_SCORE]
            )
            if train_config.valid_rate == 0.:
                # No validation part: checkpoints follow the training F1.
                if improved_on_train:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                    saver.save(sess, save_path=model_output_prefix, global_step=global_step)
                else:
                    no_update_count[TRAIN] += 1
            else:
                if improved_on_train:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                else:
                    no_update_count[TRAIN] += 1

                # Evaluate on the validation part without updating the model
                print('VALID')
                n_sample = index_iterator.n_sample(VALID)
                labels_predict = list()
                labels_gold = list()

                for batch_index in index_iterator.iterate(batch_size, mode=VALID, shuffle=False):
                    feed_dict = {
                        nn.var(_key): dataset[_key][batch_index]
                        for _key in feed_key[TEST]
                    }
                    feed_dict[nn.var(TEST_MODE)] = 1
                    res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                    labels_predict += res[LABEL_PREDICT].tolist()
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

                labels_predict, labels_gold = labels_predict[:n_sample], labels_gold[:n_sample]
                res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
                print_evaluation(res)

                # Early stop bookkeeping: checkpoint on validation improvement
                if best_res[VALID] is None or res[F1_SCORE] > best_res[VALID][F1_SCORE]:
                    saver.save(sess, save_path=model_output_prefix, global_step=global_step)
                    best_res[VALID] = res
                    no_update_count[VALID] = 0
                else:
                    no_update_count[VALID] += 1

            # NOTE(review): only the TRAIN counter stops training;
            # no_update_count[VALID] is maintained but never read -- confirm
            # this is intended.
            if no_update_count[TRAIN] >= max_no_update_count:
                break
        # Training ends ########################################################

    print(
        '========================= BEST ROUND EVALUATION ========================='
    )

    with tf.Session() as sess:
        prefix_checkpoint = tf.train.latest_checkpoint(
            data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        for mode in [TRAIN, TEST]:
            dataset = datasets[mode]
            index_iterator = index_iterators[mode]
            n_sample = index_iterator.n_sample()

            prob_predict = list()
            labels_predict = list()
            labels_gold = list()
            hidden_feats = list()

            for batch_index in index_iterator.iterate(batch_size, shuffle=False):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                prob_predict += res[PROB_PREDICT].tolist()
                labels_predict += res[LABEL_PREDICT].tolist()
                hidden_feats += res[HIDDEN_FEAT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            prob_predict = prob_predict[:n_sample]
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            hidden_feats = hidden_feats[:n_sample]

            if mode == TEST:
                res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)
                best_res[TEST] = res

            # Export the hidden-layer features
            with open(data_config.output_path(output_key, mode, HIDDEN_FEAT), 'w') as file_obj:
                for _feat in hidden_feats:
                    file_obj.write('\t'.join(map(str, _feat)) + '\n')
            # Export the predicted labels
            with open(data_config.output_path(output_key, mode, LABEL_PREDICT), 'w') as file_obj:
                for _label in labels_predict:
                    file_obj.write('{}\n'.format(_label))
            # Export the predicted probabilities
            with open(data_config.output_path(output_key, mode, PROB_PREDICT), 'w') as file_obj:
                for _prob in prob_predict:
                    file_obj.write('\t'.join(map(str, _prob)) + '\n')

        for mode in [TRAIN, VALID, TEST]:
            # Without a validation part there is no VALID result to report.
            if mode == VALID and train_config.valid_rate == 0.:
                continue
            res = best_res[mode]
            print(mode)
            print_evaluation(res)

            with open(data_config.output_path(output_key, mode, EVALUATION), 'w') as file_obj:
                json.dump(res, file_obj)
            print()

    print('OUTPUT_KEY: {}'.format(output_key))
def logistic_regression(dataset_key, text_version, label_version=None, use_class_weights=True):
    """
    Train a character n-gram TF-IDF + logistic-regression pipeline and print
    the evaluation on the TRAIN and TEST splits.

    python algo/svm.py lr semeval2018_task3 -t ek -l A

    :param dataset_key: string, name of the package under ``dataset`` whose
        ``config`` module provides the data configuration
    :param text_version: string
    :param label_version: string or None
    :param use_class_weights: bool, use ``class_weight='balanced'`` when True
        (the default, matching the previously hard-coded behaviour) and no
        class weighting when False
    :return: None
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    data_config = getattr(
        importlib.import_module('dataset.{}.config'.format(dataset_key)),
        'config')

    pos_label = None
    if dataset_key == 'semeval2018_task3' and label_version == 'A':
        pos_label = 1

    datasets = dict()
    for mode in [TRAIN, TEST]:
        datasets[mode] = {
            TEXT: load_text_list(data_config.path(mode, TEXT, text_version)),
            LABEL: load_label_list(data_config.path(mode, LABEL, label_version))
        }

    # Character 1- to 6-grams, case preserved, vocabulary capped at 50000.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 6),
        analyzer='char',
        lowercase=False,
        smooth_idf=True,
        max_features=50000)

    # Honour the caller's choice; the flag used to be silently ignored.
    class_weight = 'balanced' if use_class_weights else None
    clf = LogisticRegression(C=1., random_state=0, class_weight=class_weight)

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('clf', clf)
    ])
    pipeline.fit(datasets[TRAIN][TEXT], datasets[TRAIN][LABEL])

    for mode in [TRAIN, TEST]:
        labels_predict = pipeline.predict(datasets[mode][TEXT])
        labels_gold = datasets[mode][LABEL]
        res = basic_evaluate(gold=labels_gold, pred=labels_predict, pos_label=pos_label)

        print(mode)
        print_evaluation(res)
        print()
def tf_idf(dataset_key, text_version, label_version=None, use_class_weights=True):
    """
    Fit an SVM on stacked bag-of-words features (word 1- to 3-gram counts
    plus character-level TF-IDF) and print the evaluation on the TRAIN and
    TEST splits.

    python algo/svm.py tf_idf semeval2018_task3 -t ek -l A

    :param dataset_key: string, name of the package under ``dataset`` whose
        ``config`` module provides the data configuration
    :param text_version: string
    :param label_version: string or None
    :param use_class_weights: bool, use ``class_weight='balanced'`` when True
    :return: None
    """
    from sklearn import preprocessing
    from sklearn.preprocessing import Normalizer
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

    config_module = importlib.import_module('dataset.{}.config'.format(dataset_key))
    data_config = getattr(config_module, 'config')

    is_binary_task = dataset_key == 'semeval2018_task3' and label_version == 'A'
    pos_label = 1 if is_binary_task else None

    datasets = {
        split: {
            TEXT: load_text_list(data_config.path(split, TEXT, text_version)),
            LABEL: load_label_list(data_config.path(split, LABEL, label_version)),
        }
        for split in (TRAIN, TEST)
    }

    word_counts = CountVectorizer(
        tokenizer=lambda text: text.split(' '),
        ngram_range=(1, 3),
        min_df=0.02,
        max_features=1000)
    char_tfidf = TfidfVectorizer(
        ngram_range=(1, 1),
        analyzer='char',
        lowercase=False,
        smooth_idf=True,
        sublinear_tf=True)
    vectorizers = {TF: word_counts, TF_C: char_tfidf}

    # Fit every vectorizer on the training texts, then reuse it for the test texts.
    for feat_key, vectorizer in vectorizers.items():
        datasets[TRAIN][feat_key] = vectorizer.fit_transform(datasets[TRAIN][TEXT])
        datasets[TEST][feat_key] = vectorizer.transform(datasets[TEST][TEXT])

    def _stacked_features(split):
        # Concatenate the feature blocks of one split in vectorizer order.
        return hstack([datasets[split][feat_key] for feat_key in vectorizers])

    if use_class_weights:
        clf = svm.SVC(class_weight='balanced')
    else:
        clf = svm.SVC(class_weight=None)
    clf.fit(X=_stacked_features(TRAIN), y=datasets[TRAIN][LABEL])

    for split in (TRAIN, TEST):
        predicted = clf.predict(X=_stacked_features(split))
        res = basic_evaluate(gold=datasets[split][LABEL], pred=predicted, pos_label=pos_label)

        print(split)
        print_evaluation(res)
        print()
def main(label_version, ensemble_mode='mv', config_path='e83.yaml', build_analysis=False):
    """
    Ensemble the exported predictions of several models by soft or majority
    voting and print the evaluation on the TRAIN and TEST splits.

    [Usage]
    python algo/ensemble.py main -d semeval2018_task3 -l A -e mv
    python algo/ensemble.py main -d semeval2018_task3 -l A -e mv --build-analysis
    NOTE(review): the dataset is fixed to 'semeval2018_task3' inside the
    function; the ``-d`` flag in the usage lines is not a parameter here.

    :param label_version: string, 'A' makes label 1 the positive label
    :param ensemble_mode: string; SOFT_VOTING sums the components'
        probability rows, MAJORITY_VOTING counts the components' labels;
        the arg-max is the ensemble label in both cases
    :param config_path: string, path of the YAML ensemble configuration
    :param build_analysis: bool, also write a tab-separated report of the
        wrong predictions for every split
    :return: None
    :raises NotImplementedError: for WEIGHTED_MAJORITY_VOTE
    :raises ValueError: for any other unknown ``ensemble_mode``
    """
    dataset_key = 'semeval2018_task3'
    pos_label = 1 if label_version == 'A' else None

    # NOTE(security): yaml.load without an explicit Loader may construct
    # arbitrary Python objects -- only feed it trusted config files
    # (consider yaml.safe_load).  Kept as-is to preserve loading semantics.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    data_config = getattr(
        importlib.import_module('dataset.{}.config'.format(dataset_key)),
        'config')

    labels_predict = dict()
    labels_gold = dict()
    n_sample = dict()

    for mode in [TRAIN, TEST]:
        label_path = data_config.path(mode, LABEL, label_version)
        labels_gold[mode] = load_label_list(label_path)
        n_sample[mode] = len(labels_gold[mode])
    # Number of classes is derived from the largest gold label of TEST.
    output_dim = max(labels_gold[TEST]) + 1

    if ensemble_mode == SOFT_VOTING:
        for mode in [TRAIN, TEST]:
            components = dict()
            for component_key in config.components:
                path = data_config.output_path(component_key, mode, PROB_PREDICT)

                prob_list = list()
                with open(path) as file_obj:
                    for line in file_obj:
                        line = line.strip()
                        if line == '':
                            continue
                        prob_list.append(list(map(float, line.split('\t'))))
                components[component_key] = prob_list

            labels = list()
            for i in range(n_sample[mode]):
                prob = np.zeros((output_dim, ))
                for prob_list in components.values():
                    prob += np.asarray(prob_list[i])
                labels.append(np.argmax(prob))
            labels_predict[mode] = labels

    elif ensemble_mode == MAJORITY_VOTING:
        for mode in [TRAIN, TEST]:
            components = dict()
            for component_key in config.components:
                path = data_config.output_path(component_key, mode, LABEL_PREDICT)

                label_list = list()
                with open(path) as file_obj:
                    for line in file_obj:
                        line = line.strip()
                        if line == '':
                            continue
                        label_list.append(int(line))
                components[component_key] = label_list

            labels = list()
            for i in range(n_sample[mode]):
                # One vote per component; np.argmax returns the lowest
                # label on ties.
                votes = np.zeros((output_dim, ))
                for label_list in components.values():
                    votes[label_list[i]] += 1
                labels.append(np.argmax(votes))
            labels_predict[mode] = labels

    elif ensemble_mode == WEIGHTED_MAJORITY_VOTE:
        raise NotImplementedError
    else:
        raise ValueError('unknown mode: {}'.format(ensemble_mode))

    for mode in [TRAIN, TEST]:
        res = basic_evaluate(gold=labels_gold[mode], pred=labels_predict[mode], pos_label=pos_label)

        print(mode)
        print_evaluation(res)
        print()

        if build_analysis:
            output_path = data_config.path(mode, ANALYSIS, WRONG_PREDICT)
            text_list = load_text_list(data_config.path(mode, TEXT))
            report = generate_wrong_prediction_report(
                labels_gold=labels_gold[mode],
                labels_predict=labels_predict[mode],
                text_list=text_list)

            with open(output_path, 'w') as file_obj:
                # The header needs its own line, and rows use the same tab
                # separator as the header so texts containing spaces stay
                # parseable.
                file_obj.write('gold\tpredict\ttext\n')
                for l_gold, l_predict, t in report:
                    file_obj.write('{}\t{}\t{}\n'.format(l_gold, l_predict, t))