def tri_out(filename, thr, output_file='test.txt', config_path='e93.yaml'):
    """Re-label non-OTHERS samples by TRI classifier votes and export the result.

    For each sample whose current label is not 0 (not OTHERS), if the TRI
    vote winner differs from the current label and its vote count reaches
    ``thr``, the sample is re-labeled to the winner. Every examined change
    is logged to ``out/otri.txt``; the final label list is exported via
    ``export_final``.

    :param filename: path of the origin dataset to re-label
    :param thr: minimum winning vote count required to flip a label
    :param output_file: destination passed to ``export_final``
    :param config_path: path of the YAML ensemble configuration
    """
    thr = int(thr)
    # NOTE(review): yaml.load without an explicit Loader is deprecated and
    # unsafe on untrusted input — if the config is trusted this is only a
    # deprecation issue; consider yaml.safe_load. Context manager added so
    # the config file handle is closed.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    votes_others = load_others_votes(config, [FINAL, ])
    votes_tri = load_tri_votes(config, [FINAL, ])

    labels = list()
    dataset = Processor.load_origin(filename)
    with open('out/otri.txt', 'w') as file_obj:
        for i, (d, v_others, v_tri) in enumerate(zip(dataset, votes_others, votes_tri)):
            label = d[-1]
            if label != 0:
                idx, max_value = argmax(v_tri)
                if label != idx and max_value >= thr:
                    new_label = label_str[idx]
                    file_obj.write(
                        '{}\t{}\t{}\t{}\t{} ({}->{}, {} {})\n'.format(
                            i, d[0], d[1], d[2], d[-1],
                            label, new_label, v_others, v_tri))
                    label = idx
            labels.append(label)
    export_final(output_file, labels)
def analyse_submit(filename):
    """Print the frequency of each label in a submission file.

    :param filename: path of the origin-format dataset; the label is the
        last field of each sample.
    """
    # Counter replaces the hand-rolled defaultdict(lambda: 0) tally and
    # prints counts in descending-frequency order.
    from collections import Counter

    dataset = Processor.load_origin(filename)
    label_count = Counter(sample[-1] for sample in dataset)
    print(label_count)
def eval_sub(input_filename):
    """Evaluate a prediction file against the FINAL gold labels.

    Prints the standard evaluation summary followed by the confusion
    matrix, one comma-separated row per line.
    """
    samples = Processor.load_origin(input_filename)
    labels_predict = [sample[-1] for sample in samples]
    labels_gold = load_label_list(data_config.path(FINAL, LABEL))

    res = basic_evaluate(gold=labels_gold, pred=labels_predict)
    print_evaluation(res)
    for row in res[CONFUSION_MATRIX]:
        print(','.join(str(cell) for cell in row))
    print()
def diff(a_filename, b_filename, output_filename, config_path='e93.yaml',
         a_label=3, b_label=0):
    """Report samples labeled ``a_label`` in file A but ``b_label`` in file B.

    Each differing sample is written to ``output_filename`` together with
    how many "others" ensemble members voted 0 (OTHERS) for it; the count
    of samples where A said ``a_label`` and B agreed (did not say
    ``b_label``) is printed at the end.

    :param a_filename: origin dataset A (labels in the last field)
    :param b_filename: origin dataset B (labels in the last field)
    :param output_filename: where to write the diff report
    :param config_path: YAML ensemble configuration path
    :param a_label: label of interest in A (default 3, previously hard-coded)
    :param b_label: conflicting label in B (default 0, previously hard-coded)
    """
    # NOTE(review): yaml.load without a Loader is deprecated; context manager
    # added so the config handle is closed.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    # Count, per sample, how many "others" models predicted label 0.
    votes = None
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)
        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]
        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(a_filename)
    labels_a = list(map(lambda _item: _item[-1], dataset))
    # dataset is rebound: the report below shows the text fields of file B.
    dataset = Processor.load_origin(b_filename)
    labels_b = list(map(lambda _item: _item[-1], dataset))
    assert len(votes) == len(labels_a) == len(labels_b)

    n_match = 0
    with open(output_filename, 'w') as file_obj:
        for i, (a, b, d) in enumerate(zip(labels_a, labels_b, dataset)):
            if a == a_label:
                if b == b_label:
                    file_obj.write('{}\t{}\t{}\t{}\t{}->{} ({})\n'.format(
                        i, d[0], d[1], d[2],
                        label_str[a], label_str[b], votes[i]))
                else:
                    n_match += 1
    print(n_match)
def export_error(filename):
    """Print every misclassified FINAL sample, grouped by gold->predicted pair.

    Compares the last field of each sample in *filename* with the FINAL
    gold labels and, for all 4x4 label pairs, prints the pair header
    followed by the three text turns of each mismatched sample.
    """
    samples = Processor.load_origin(filename)
    gold_path = data_config.path(FINAL, LABEL)
    gold_labels = load_label_list(gold_path)

    # wrong[gold][predicted] -> list of rendered sample texts
    wrong = defaultdict(lambda: defaultdict(lambda: list()))
    for gold_label, sample in zip(gold_labels, samples):
        predicted = sample[-1]
        if predicted != gold_label:
            text = sample[0] + ' | ' + sample[1] + ' | ' + sample[2]
            wrong[gold_label][predicted].append(text)

    for g in range(4):
        for p in range(4):
            print('{}->{}'.format(label_str[g], label_str[p]))
            for text in wrong[g][p]:
                print('\t{}'.format(text))
def filter_by_others(input_filename, output_filename, thr, config_path='e93.yaml',
                     final_output='test.txt'):
    """Reset to OTHERS any sample that enough "others" models voted 0 on.

    For each sample predicted non-0 in *input_filename*, if at least
    ``thr`` of the configured "others" ensemble members predicted 0, the
    sample is logged to *output_filename* and its label is reset to 0.
    The corrected label list is exported via ``export_final``.

    :param input_filename: origin dataset with current predictions
    :param output_filename: where to log each reverted sample
    :param thr: minimum number of 0-votes required to revert a sample
    :param config_path: YAML ensemble configuration path
    :param final_output: destination for ``export_final``
        (default 'test.txt', previously hard-coded)
    """
    thr = int(thr)
    # NOTE(review): yaml.load without a Loader is deprecated; context manager
    # added so the config handle is closed.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    # Count, per sample, how many "others" models predicted label 0.
    votes = None
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)
        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]
        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(input_filename)
    labels = list(map(lambda _item: _item[-1], dataset))
    assert len(votes) == len(labels)

    with open(output_filename, 'w') as file_obj:
        for i, (p, d) in enumerate(zip(labels, dataset)):
            if p != 0 and votes[i] >= thr:
                file_obj.write('{}\t{}\t{}\t{}\t{} ({})\n'.format(
                    i, d[0], d[1], d[2], p, votes[i]))
                labels[i] = 0
    export_final(final_output, labels)
def test_submit(filename_pred, filename_gold):
    """Evaluate a prediction file against a gold file and print the results.

    :param filename_pred: origin dataset whose last field is the prediction
    :param filename_gold: origin dataset whose last field is the gold label

    Bug fix: the original called ``basic_evaluate(pred, gold)`` positionally,
    while every other call site in this file uses
    ``basic_evaluate(gold=..., pred=...)`` — so gold and pred were swapped.
    Keyword arguments make the binding explicit and correct.
    """
    pred = [sample[-1] for sample in Processor.load_origin(filename_pred)]
    gold = [sample[-1] for sample in Processor.load_origin(filename_gold)]
    res = basic_evaluate(gold=gold, pred=pred)
    print_evaluation(res)
def build_basic():
    """Export turn texts, labels, and binary labels for all three splits.

    The TRAIN/TEST/FINAL stanzas of the original were three copies of the
    same code; they now share ``_export_split``. File handles are closed
    deterministically via ``with`` instead of the leaking
    ``open(path, 'w').write(...)`` pattern.
    """
    config.prepare_data_folder()
    _export_split(TRAIN, Processor.load_origin_train)
    _export_split(TEST, Processor.load_origin_dev)
    _export_split(FINAL, Processor.load_origin_test)


def _export_split(mode, loader):
    """Write one split: three per-turn text files, a label file, and a
    binary (0 vs non-0) label file under ``config.path(mode, ...)``.

    :param mode: split key (TRAIN / TEST / FINAL)
    :param loader: zero-arg callable yielding (turn_1, turn_2, turn_3, label_idx)
    """
    labels = list()
    text_turns = [[] for _ in range(3)]
    for turn_1, turn_2, turn_3, label_idx in loader():
        for i, text in enumerate((turn_1, turn_2, turn_3)):
            text_turns[i].append(text)
        labels.append(label_idx)

    for i, texts in enumerate(text_turns):
        path = config.path(mode, 'turn', str(i))
        with open(path, 'w') as file_obj:
            file_obj.write('\n'.join(texts) + '\n')

    path = config.path(mode, LABEL)
    with open(path, 'w') as file_obj:
        file_obj.write('\n'.join(map(str, labels)) + '\n')

    # Collapse the 4-way labels to OTHERS (0) vs not-OTHERS (1).
    binary_labels = [0 if label == 0 else 1 for label in labels]
    path = config.path(mode, LABEL, 'binary')
    with open(path, 'w') as file_obj:
        file_obj.write('\n'.join(map(str, binary_labels)) + '\n')
def main(input_filename, config_path='e93.yaml', final_output=None):
    """
    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis
    """
    # NOTE(review): yaml.load without an explicit Loader is deprecated and
    # unsafe on untrusted input; the file handle is also never closed.
    config_data = yaml.load(open(config_path))
    config = Config(data=config_data)

    labels_gold = dict()
    labels_predict = dict()
    labels_predict_last = dict()

    # Base predictions: the last field of every sample in the input file.
    dataset = Processor.load_origin(input_filename)
    labels_predict[FINAL] = list(map(lambda _item: _item[-1], dataset))

    for mode in [FINAL, ]:
        # Evaluation against gold is skipped for FINAL (no gold loaded here);
        # with the current single-element mode list this branch never runs.
        if not mode == FINAL:
            res = basic_evaluate(gold=labels_gold[mode], pred=labels_predict[mode])
            print(mode)
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))
            print()

        n_sample = len(labels_predict[mode])
        labels_predict_last[mode] = labels_predict[mode]

        # Stage 1: correct labels within "HAS" (non-OTHERS) using TRI votes.
        if config.tri_enabled:
            n_changed = 0
            # votes[i][label] = how many TRI models predicted `label` for sample i
            # (4 = number of classes, consistent with label_str usage elsewhere).
            votes = [[0 for _ in range(4)] for _ in range(n_sample)]
            for output_key in config.tri:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))
                for i, label in enumerate(labels):
                    votes[i][label] += 1

            # Work on a copy so labels_predict_last[mode] is only replaced
            # once the whole pass is done.
            base = list() + labels_predict_last[mode]
            for i, vote in enumerate(votes):
                arg_max = int(np.argmax(vote))
                if arg_max == 0:
                    # TRI majority says OTHERS: never used to change labels here.
                    continue
                if base[i] != 0:
                    # Already non-OTHERS: flip to the TRI winner when it has
                    # enough votes (tri_min_vote threshold).
                    if vote[arg_max] >= config.tri_min_vote:
                        if base[i] != arg_max:
                            n_changed += 1
                        base[i] = arg_max
                elif vote[arg_max] >= config.tri_out_vote:
                    # Currently OTHERS: promote to the TRI winner only with the
                    # (separate) tri_out_vote threshold.
                    base[i] = arg_max
                    n_changed += 1
            print('n_exchanged within "HAS": {}'.format(n_changed))
            labels_predict_last[mode] = base

            if not mode == FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after TRI)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        # Stage 2: revert samples predicted as "HAS" back to OTHERS when
        # enough "others" ensemble members voted 0 for them.
        if config.others_enabled:
            # votes[i] = number of "others" models predicting 0 for sample i
            votes = [0 for _ in range(n_sample)]
            n_changed = 0
            for output_key in config.others:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))
                for i, label in enumerate(labels):
                    if label == 0:
                        votes[i] += 1

            # 'all' means unanimity across the configured "others" models.
            if config.others_min_vote == 'all':
                min_vote = len(config.others)
            else:
                min_vote = int(config.others_min_vote)

            base = list() + labels_predict_last[mode]
            for i, vote in enumerate(votes):
                if vote >= min_vote:
                    if base[i] != 0:
                        n_changed += 1
                        base[i] = 0
            print('n_changed to "OTHERS": {}'.format(n_changed))
            labels_predict_last[mode] = base

            if not mode == FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after OTHERS)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        # Export the fully corrected FINAL labels when a destination is given.
        if mode == FINAL and final_output is not None:
            labels = labels_predict_last[FINAL]
            export_final(final_output, labels)