Example #1
def eval_trained(device, gres: expdata.ResData, model_file_prefix,
                 mentions_file, sents_file, srl_results_file, dep_tags_file,
                 single_type_path, output_preds_file):
    models = load_train_models(device, gres, model_file_prefix)

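    # Build evaluation samples from the raw mention, sentence, dependency-tag
    # and SRL files; use_all=True keeps every mention.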
    all_samples = samples_from_txt(gres.token_id_dict,
                                   gres.unknown_token_id,
                                   gres.type_id_dict,
                                   mentions_file,
                                   sents_file,
                                   dep_tags_file,
                                   srl_results_file,
                                   use_all=True)
    samples_list = __split_samples_by_arg_idx(all_samples)
    sample_type_ids_list = __get_full_type_ids_of_samples(
        gres.parent_type_ids_dict, all_samples)
    true_labels_dict = {
        s[0]: [gres.type_vocab[tid] for tid in type_ids]
        for type_ids, s in zip(sample_type_ids_list, all_samples)
    }
    print([len(samples) for samples in samples_list], 'samples')

    acc, maf1, mif1, result_objs = __eval(gres,
                                          models,
                                          samples_list,
                                          true_labels_dict,
                                          single_type_path=single_type_path)
    print(acc, maf1, mif1)

    datautils.save_json_objs(result_objs, output_preds_file)
    print('results saved to {}'.format(output_preds_file))
Example #2
File: train.py Project: sxrczh/cfet
def train_model():
    if config.dataset == 'cufe':
        data_prefix = config.CUFE_FILES['training_data_prefix']
        if config.train_on_crowd:
            data_prefix = config.CUFE_FILES['crowd_training_data_prefix']

    elif config.dataset == 'ufet':
        data_prefix = config.UFET_FILES['training_data_prefix']

    data_prefix += f'-{config.seq_tokenizer_name}'
    if config.dir_suffix:
        data_prefix += '-' + config.dir_suffix
    if not os.path.isdir(data_prefix): os.mkdir(data_prefix)

    str_today = datetime.datetime.now().strftime('%m_%d_%H%M')
    if not os.path.isdir(join(data_prefix, 'log')):
        os.mkdir(join(data_prefix, 'log'))
    if not config.test:
        log_file = os.path.join(
            join(data_prefix, 'log'),
            '{}-{}.log'.format(str_today, config.model_name))
        print(log_file)
    else:
        log_file = os.path.join(
            config.LOG_DIR, '{}-{}-test.log'.format(str_today,
                                                    config.model_name))
    # if not os.path.isdir(log_file) and not config.test: os.mkdir(log_file)
    init_universal_logging(log_file, mode='a', to_stdout=True)

    save_model_dir = join(data_prefix, 'models')
    if not os.path.isdir(save_model_dir): os.mkdir(save_model_dir)

    gres = exp_utils.GlobalRes(config)

    run_name = f'{config.dataset}-{config.model_name}-{config.seq_tokenizer_name}-{str_today}'
    logging.info(f'run_name: {run_name}')
    logging.info(
        f'/data/cleeag/word_embeddings/{config.mention_tokenizer_name}/{config.mention_tokenizer_name}_tokenizer&vecs.pkl -- loaded'
    )
    logging.info(f'training on {config.dataset}')
    logging.info(
        f'total type count: {len(gres.type2type_id_dict)}, '
        f'general type count: {0 if config.without_general_types else len(gres.general_type_set)}'
    )

    if config.dataset == 'ufet':
        crowd_training_samples = f'{config.CROWD_TRAIN_DATA_PREFIX}-{config.seq_tokenizer_name}.pkl'
        if config.test:

            train_data_pkl = join(data_prefix, 'dev.pkl')
            training_samples = datautils.load_pickle_data(train_data_pkl)
            crowd_training_samples = datautils.load_pickle_data(
                crowd_training_samples)

        else:
            train_data_pkl = join(data_prefix, 'train.pkl')
            print('loading training data {} ...'.format(train_data_pkl),
                  end=' ',
                  flush=True)
            training_samples = datautils.load_pickle_data(train_data_pkl)
            print('done', flush=True)
            logging.info('training data {} -- loaded'.format(train_data_pkl))

            crowd_training_samples = datautils.load_pickle_data(
                crowd_training_samples)

            if config.fine_tune and config.use_bert:
                # training_samples = random.choices(len(training_samples) // 10, training_samples)
                random.shuffle(training_samples)
                training_samples = training_samples[:len(training_samples) //
                                                    10]
                logging.info(
                    f'fine-tuning with {len(training_samples)} samples')

        # dev_data_pkl = join(data_prefix, 'dev.pkl')
        # dev_samples = datautils.load_pickle_data(dev_data_pkl)
        dev_json_path = join(data_prefix, 'dev.json')
        dev_samples = exp_utils.model_samples_from_json(dev_json_path)
        dev_true_labels_dict = {
            s['mention_id']: [
                gres.type2type_id_dict.get(x) for x in s['types']
                if x in gres.type2type_id_dict
            ]
            for s in dev_samples
        }

        test_data_pkl = join(data_prefix, 'test.pkl')
        test_samples = datautils.load_pickle_data(test_data_pkl)
        test_true_labels_dict = {
            s['mention_id']: [
                gres.type2type_id_dict.get(x) for x in s['types']
                if x in gres.type2type_id_dict
            ]
            for s in test_samples
        }

    else:
        dev_data_pkl = join(data_prefix, 'dev.pkl')
        test_data_pkl = join(data_prefix, 'test.pkl')
        if config.test:
            train_data_pkl = join(data_prefix, 'dev.pkl')
        else:
            train_data_pkl = join(data_prefix, 'train.pkl')
        # test_data_pkl = config.CUFE_FILES['test_data_file_prefix'] + f'-{config.seq_tokenizer_name}/test.pkl'

        print('loading training data {} ...'.format(train_data_pkl),
              end=' ',
              flush=True)
        training_samples = datautils.load_pickle_data(train_data_pkl)
        print('done', flush=True)
        logging.info('training data {} -- loaded'.format(train_data_pkl))

        if not config.train_on_crowd:
            # crowd_training_samples = f'{config.CROWD_TRAIN_DATA_PREFIX}-{config.seq_tokenizer_name}/train.pkl'
            crowd_training_samples = datautils.load_pickle_data(
                join(data_prefix, 'crowd-train.pkl'))

        print('loading dev data {} ...'.format(dev_data_pkl),
              end=' ',
              flush=True)
        dev_samples = datautils.load_pickle_data(dev_data_pkl)
        print('done', flush=True)
        dev_true_labels_dict = {
            s['mention_id']: [
                gres.type2type_id_dict.get(x)
                for x in exp_utils.general_mapping(s['types'], gres)
            ]
            for s in dev_samples
        }

        test_samples = datautils.load_pickle_data(test_data_pkl)
        test_true_labels_dict = {
            s['mention_id']: [
                gres.type2type_id_dict.get(x)
                for x in exp_utils.general_mapping(s['types'], gres)
            ]
            for s in test_samples
        }

    logging.info(
        f'total training samples: {len(training_samples)}, '
        f'dev samples: {len(dev_samples)}, testing samples: {len(test_samples)}'
    )

    if not config.test:
        result_dir = join(data_prefix, f'{str_today}-results')
        if config.dataset == 'cufe':
            type_scope = 'general_types' if config.only_general_types else 'all_types'
        else:
            type_scope = config.dataset_type
        dev_results_file = join(
            result_dir,
            f'dev-{config.model_name}-{type_scope}-results-{config.inference_threshhold}.txt'
        )
        dev_incorrect_results_file = join(
            result_dir,
            f'dev-{config.model_name}-{type_scope}-incorrect_results-{config.inference_threshhold}.txt'
        )
        test_results_file = join(
            result_dir,
            f'test-{config.model_name}-{type_scope}-results-{config.inference_threshhold}.txt'
        )
        test_incorrect_results_file = join(
            result_dir,
            f'test-{config.model_name}-{type_scope}-incorrect_results-{config.inference_threshhold}.txt'
        )
    else:
        result_dir = join(data_prefix, f'test-results')
        dev_results_file = join(result_dir, f'dev-results.txt')
        dev_incorrect_results_file = join(result_dir,
                                          f'dev-incorrect_results.txt')
        test_results_file = join(result_dir, f'test-results.txt')
        test_incorrect_results_file = join(result_dir,
                                           f'test-incorrect_results.txt')

    if not os.path.isdir(result_dir): os.mkdir(result_dir)

    logging.info(
        'use_bert = {}, use_lstm = {}, use_mlp={}, bert_param_frozen={}, bert_fine_tune={}'
        .format(config.use_bert, config.use_lstm, config.use_mlp,
                config.freeze_bert, config.fine_tune))
    logging.info(
        'type_embed_dim={} context_lstm_hidden_dim={} pmlp_hdim={}'.format(
            config.type_embed_dim, config.lstm_hidden_dim,
            config.pred_mlp_hdim))

    # setup training
    device = torch.device(
        f'cuda:{config.gpu_ids[0]}') if torch.cuda.device_count() > 0 else None
    device_name = torch.cuda.get_device_name(config.gpu_ids[0])

    logging.info(f'running on device: {device_name}')
    logging.info('building model...')

    model = fet_model(config, device, gres)
    logging.info(f'transfer={config.transfer}')

    if config.continue_train:
        model_path = config.CONTINUE_TRAINING_PATH[config.continue_train]
        logging.info(f'loading checkpoint from {model_path}')
        trained_weights = torch.load(model_path)
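        # drop the first dotted component of each checkpoint key
        # (e.g. the 'module.' prefix that DataParallel adds when saving)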
        trained_weights = {
            '.'.join(k.split('.')[1:]): v
            for k, v in trained_weights.items()
        }
        cur_model_dict = model.state_dict()
        cur_model_dict.update(trained_weights)
        model.load_state_dict(cur_model_dict)

    if config.transfer and config.use_lstm:
        logging.info(f'loading checkpoint from {config.TRANSFER_MODEL_PATH}')
        cur_model_dict = model.state_dict()
        trained_weights = torch.load(config.TRANSFER_MODEL_PATH)
        trained_weights_bilstm = {
            '.'.join(k.split('.')[1:]): v
            for k, v in trained_weights.items() if 'bi_lstm' in k
        }
        cur_model_dict.update(trained_weights_bilstm)
        model.load_state_dict(cur_model_dict)

    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=config.gpu_ids)

    batch_size = 32 if config.dataset == 'cufe' and config.train_on_crowd else config.batch_size
    n_iter = 150 if config.dataset == 'cufe' and config.train_on_crowd else config.n_iter
    n_batches = (len(training_samples) + batch_size - 1) // batch_size
    n_steps = n_iter * n_batches
    eval_cycle = config.eval_cycle

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    losses = list()
    best_dev_acc = -1
    best_maf1_v = -1
    step = 0
    steps_since_last_best = 0

    # start training
    logging.info('{}'.format(model.__class__.__name__))
    logging.info('training batch size: {}'.format(batch_size))
    logging.info(
        '{} epochs, {} steps, {} steps per iter, learning rate={}, lr_decay={}, start training ...'
        .format(n_iter, n_steps, n_batches, config.learning_rate,
                config.lr_gamma))

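    # Main training loop: cycle through mini-batches until n_steps updates are done.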
    while True:
        if step == n_steps:
            break

        batch_idx = step % n_batches
        batch_beg, batch_end = batch_idx * batch_size, min(
            (batch_idx + 1) * batch_size, len(training_samples))
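        # For UFET (and CUFE when not training on crowd data only), pad each
        # batch with randomly drawn crowd-annotated samples.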
        if config.dataset == 'ufet':
            batch_samples = training_samples[batch_beg:batch_end - batch_size * 2 // 3] \
                            + random.choices(crowd_training_samples, k=batch_size * 1 // 3)
        elif config.dataset == 'cufe':
            if not config.train_on_crowd:
                batch_samples = training_samples[batch_beg:batch_end - batch_size * 2 // 3] \
                                + random.choices(crowd_training_samples, k=batch_size * 1 // 3)
            else:
                batch_samples = training_samples[batch_beg:batch_end]

        try:
            input_dataset, type_vecs = exp_utils.samples_to_tensor(
                config, gres, batch_samples)

            input_dataset = tuple(x.to(device) for x in input_dataset)
            type_vecs = type_vecs.to(device)
            model.module.train()
            logits = model(input_dataset, gres)
        except Exception:
            # skip batches that fail to build tensors or run the forward pass
            step += 1
            continue

        if config.dataset == 'ufet':
            loss = model.module.define_loss(logits, type_vecs,
                                            config.dataset_type)
        elif config.GENERAL_TYPES_MAPPING and not config.only_general_types:
            loss = model.module.get_uw_loss(logits, type_vecs, gres)
        else:
            loss = model.module.get_loss(logits, type_vecs)
        optimizer.zero_grad()

        loss.backward()
        if config.use_lstm:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0,
                                           float('inf'))
        optimizer.step()
        losses.append(loss.data.cpu().numpy())
        step += 1
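        # Periodically evaluate on dev/test and checkpoint when dev macro-F1 improves.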
        if step % eval_cycle == 0 and step > 0:
            print('\nevaluating...')
            l_v, acc_v, pacc_v, maf1_v, ma_p_v, ma_r_v, mif1_v, dev_results, incorrect_dev_result_objs = \
                model_utils.eval_fetel(config, gres, model, dev_samples, dev_true_labels_dict)

            l_t, acc_t, pacc_t, maf1_t, ma_p_t, ma_r_t, mif1_t, test_results, incorrect_test_result_objs = \
                model_utils.eval_fetel(config, gres, model, test_samples, test_true_labels_dict)

            if maf1_v > best_maf1_v:
                steps_since_last_best = 0
            best_tag = '*' if maf1_v > best_maf1_v else ''
            logging.info(
                'run name={}, step={}/{}, learning rate={}, losses={:.4f}, steps_since_last_best={}'
                .format(run_name,
                        step, n_steps, optimizer.param_groups[0]['lr'],
                        sum(losses), steps_since_last_best))
            logging.info(
                'dev evaluation result: '
                'l_v={:.4f} acc_v={:.4f} pacc_v={:.4f} macro_f1_v={:.4f} micro_f1_v={:.4f}{}'
                .format(l_v, acc_v, pacc_v, maf1_v, mif1_v, best_tag))
            logging.info(
                f'dev evaluation result: macro_p={ma_p_v:.4f}, macro_r={ma_r_v:.4f}'
            )

            logging.info(
                'test evaluation result: '
                'l_v={:.4f} acc_t={:.4f} pacc_t={:.4f} macro_f1_t={:.4f} micro_f1_t={:.4f}{}'
                .format(l_t, acc_t, pacc_t, maf1_t, mif1_t, best_tag))
            logging.info(
                f'test evaluation result: macro_p={ma_p_t:.4f}, macro_r={ma_r_t:.4f}'
            )

            if maf1_v > best_maf1_v:
                if save_model_dir and not config.test:
                    save_model_file = join(
                        save_model_dir, f'{config.model_name}-{str_today}.pt')
                    torch.save(model.state_dict(), save_model_file)
                    logging.info('model saved to {}'.format(save_model_file))

                logging.info(
                    'prediction result saved to {}'.format(result_dir))
                datautils.save_json_objs(dev_results, dev_results_file)
                datautils.save_json_objs(incorrect_dev_result_objs,
                                         dev_incorrect_results_file)
                datautils.save_json_objs(test_results, test_results_file)
                datautils.save_json_objs(incorrect_test_result_objs,
                                         test_incorrect_results_file)
                # best_dev_acc = acc_v
                best_maf1_v = maf1_v

            losses = list()

        steps_since_last_best += 1
Example #3
def gen_training_data_from_wiki(typed_mentions_file, sents_file, word_vecs_pkl, sample_rate,
                                n_dev_samples, output_files_name_prefix, core_title_wid_file=None):
    np.random.seed(config.RANDOM_SEED)

    core_wids = None
    if core_title_wid_file is not None:
        df = datautils.load_csv(core_title_wid_file)
        core_wids = {wid for _, wid in df.itertuples(False, None)}

    token_vocab, token_vecs = datautils.load_pickle_data(word_vecs_pkl)
    token_id_dict = {t: i for i, t in enumerate(token_vocab)}
    unknown_token_id = token_id_dict[config.TOKEN_UNK]

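    # Stream mentions and sentences in parallel; the sentence file is assumed to
    # be ordered consistently with the mention file (matched on wid and sent_id).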
    f_mention = open(typed_mentions_file, encoding='utf-8')
    f_sent = open(sents_file, encoding='utf-8')
    all_samples = list()
    cur_sent = json.loads(next(f_sent))
    mention_id = 0
    for i, line in enumerate(f_mention):
        if (i + 1) % 1000000 == 0:
            print(i + 1)
        # if i > 400000:
        #     break

        v = np.random.uniform()
        if v > sample_rate:
            continue

        (wid, mention_str, sent_id, pos_beg, pos_end, target_wid, type_ids
         ) = datautils.parse_typed_mention_file_line(line)
        if core_wids is not None and target_wid not in core_wids:
            continue

        mention_str = mention_str.replace('-LRB-', '(').replace('-RRB-', ')')
        while not (cur_sent['wid'] == wid and cur_sent['sent_id'] == sent_id):
            cur_sent = json.loads(next(f_sent))
        sent_tokens = cur_sent['tokens'].split(' ')
        sent_token_ids = [token_id_dict.get(token, unknown_token_id) for token in sent_tokens]

        sample = (mention_id, mention_str, pos_beg, pos_end, target_wid, type_ids, sent_token_ids)
        mention_id += 1
        all_samples.append(sample)
        # print(i, mention_str)
        # print(sent_token_ids)
        # print()
    f_mention.close()
    f_sent.close()

    dev_samples = all_samples[:n_dev_samples]
    train_samples = all_samples[n_dev_samples:]

    print('shuffling ...', end=' ', flush=True)
    rand_perm = np.random.permutation(len(train_samples))
    train_samples_shuffled = list()
    for idx in rand_perm:
        train_samples_shuffled.append(train_samples[idx])
    train_samples = train_samples_shuffled
    print('done')

    dev_mentions, dev_sents = list(), list()
    for i, sample in enumerate(dev_samples):
        mention_id, mention_str, pos_beg, pos_end, target_wid, type_ids, sent_token_ids = sample
        mention = {'mention_id': mention_id, 'span': [pos_beg, pos_end], 'str': mention_str, 'sent_id': i}
        sent = {'sent_id': i, 'text': ' '.join([token_vocab[token_id] for token_id in sent_token_ids]),
                'afet-senid': 0, 'file_id': 'null'}
        dev_mentions.append(mention)
        dev_sents.append(sent)
    datautils.save_json_objs(dev_mentions, output_files_name_prefix + '-dev-mentions.txt')
    datautils.save_json_objs(dev_sents, output_files_name_prefix + '-dev-sents.txt')

    datautils.save_pickle_data(dev_samples, output_files_name_prefix + '-dev.pkl')
    datautils.save_pickle_data(train_samples, output_files_name_prefix + '-train.pkl')
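
A minimal call sketch for the function above; the file paths and sampling parameters below are hypothetical placeholders, not values taken from the project:

# Hypothetical invocation; substitute real file locations before running.
gen_training_data_from_wiki(
    typed_mentions_file='data/wiki/typed-mentions.txt',
    sents_file='data/wiki/sents.json',
    word_vecs_pkl='data/wiki/word-vecs.pkl',
    sample_rate=0.1,
    n_dev_samples=2000,
    output_files_name_prefix='data/wiki/enwiki-fet',
    core_title_wid_file=None)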
Example #4
def gen_training_data_from_wiki(typed_mentions_file,
                                sents_file,
                                word_vecs_pkl,
                                sample_rate,
                                n_dev_samples,
                                output_files_name_prefix,
                                core_title_wid_file=None,
                                do_bert=False):
    np.random.seed(config.RANDOM_SEED)
    print('output file destination: {}'.format(output_files_name_prefix))

    if do_bert:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                                  do_lower_case=False)
        # tokenizer.add_special_tokens({'mention': '[MASK]'})

    core_wids = None
    if core_title_wid_file is not None:
        df = datautils.load_csv(core_title_wid_file)
        core_wids = {wid for _, wid in df.itertuples(False, None)}

    print('loading word vec...')
    token_vocab, token_vecs = datautils.load_pickle_data(word_vecs_pkl)
    token_id_dict = {t: i for i, t in enumerate(token_vocab)}
    unknown_token_id = token_id_dict[config.TOKEN_UNK]

    f_mention = open(typed_mentions_file, encoding='utf-8')
    f_sent = open(sents_file, encoding='utf-8')
    all_samples = list()
    cur_sent = json.loads(next(f_sent))
    mention_id = 0
    for i, line in enumerate(f_mention):
        if (i + 1) % 100000 == 0:
            print(i + 1)
        # if i > 40000:
        #     break

        v = np.random.uniform()
        if v > sample_rate:
            continue

        (wid, mention_str, sent_id, pos_beg, pos_end, target_wid,
         type_ids) = datautils.parse_typed_mention_file_line(line)
        if core_wids is not None and target_wid not in core_wids:
            continue

        mention_str = mention_str.replace('-LRB-', '(').replace('-RRB-', ')')
        while not (cur_sent['wid'] == wid and cur_sent['sent_id'] == sent_id):
            cur_sent = json.loads(next(f_sent))
        sent_tokens = cur_sent['tokens'].split(' ')
        sent_token_ids = [
            token_id_dict.get(token, unknown_token_id) for token in sent_tokens
        ]

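        # BERT variant: replace the mention span with [MASK], re-tokenize with
        # WordPiece, and record the position of the [MASK] token.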
        if not do_bert:
            sample = (mention_id, mention_str, pos_beg, pos_end, target_wid,
                      type_ids, sent_token_ids)
        else:
            sent_tokens = sent_tokens[:pos_beg] + ['[MASK]'
                                                   ] + sent_tokens[pos_end:]
            full_sent = ' '.join(sent_tokens)
            tokens = ["[CLS]"]
            t = tokenizer.tokenize(full_sent)
            tokens.extend(t)
            mention_token_idx_bert = 0
            # use a separate loop variable so the outer file-line counter `i` is kept intact
            for j, x in enumerate(tokens):
                if x == '[MASK]':
                    mention_token_idx_bert = j
                    break
            tokens.append("[SEP]")
            sent_token_bert_ids = tokenizer.convert_tokens_to_ids(tokens)

            sample = (mention_id, mention_str, pos_beg, pos_end, target_wid,
                      type_ids, sent_token_ids, sent_token_bert_ids,
                      mention_token_idx_bert)

        mention_id += 1
        all_samples.append(sample)
        # print(i, mention_str)
        # print(sent_token_ids)
        # print()
        if (i + 1) % 100000 == 0:
            print(i + 1, mention_str)
            print(sent_token_ids)
            print()
            if do_bert:
                print(sent_token_bert_ids)

    f_mention.close()
    f_sent.close()

    dev_samples = all_samples[:n_dev_samples]
    train_samples = all_samples[n_dev_samples:]

    print('shuffling ...', end=' ', flush=True)
    rand_perm = np.random.permutation(len(train_samples))
    train_samples_shuffled = list()
    for idx in rand_perm:
        train_samples_shuffled.append(train_samples[idx])
    train_samples = train_samples_shuffled
    print('done')

    dev_mentions, dev_sents = list(), list()
    for i, sample in enumerate(dev_samples):
        if do_bert:
            mention_id, mention_str, pos_beg, pos_end, target_wid, type_ids, sent_token_ids, \
            sent_token_bert_ids, mention_token_idx_bert = sample
        else:
            mention_id, mention_str, pos_beg, pos_end, target_wid, type_ids, sent_token_ids = sample
        mention = {
            'mention_id': mention_id,
            'span': [pos_beg, pos_end],
            'str': mention_str,
            'sent_id': i
        }
        sent = {
            'sent_id':
            i,
            'text':
            ' '.join([token_vocab[token_id] for token_id in sent_token_ids]),
            'afet-senid':
            0,
            'file_id':
            'null'
        }
        dev_mentions.append(mention)
        dev_sents.append(sent)
    datautils.save_json_objs(dev_mentions,
                             output_files_name_prefix + '-dev-mentions.txt')
    datautils.save_json_objs(dev_sents,
                             output_files_name_prefix + '-dev-sents.txt')
    print('saving pickle data...')
    datautils.save_pickle_data(dev_samples,
                               output_files_name_prefix + '-dev.pkl')
    datautils.save_pickle_data(train_samples,
                               output_files_name_prefix + '-train.pkl')
Example #5
def train_model(test=False):
    if config.use_gpu:
        device = torch.device(
            'cuda:0') if torch.cuda.device_count() > 0 else torch.device('cpu')
        device_name = torch.cuda.get_device_name(device)
    else:
        device = torch.device('cpu')
        device_name = 'cpu'

    logging.info(f'running on device: {device_name}')
    dataset = 'figer'
    datafiles = config.FIGER_FILES
    word_vecs_file = config.WIKI_FETEL_WORDVEC_FILE
    save_model_file = os.path.join(config.DATA_DIR, 'models', 'test')

    if config.use_bert:
        data_prefix = datafiles['anchor-train-data-prefix-bert']
    else:
        data_prefix = datafiles['anchor-train-data-prefix']
    # dev_data_pkl = data_prefix + '-dev.pkl'
    # train_data_pkl = data_prefix + '-train.pkl'
    dev_data_pkl = data_prefix + '-dev-slim.pkl'
    if test:
        train_data_pkl = data_prefix + '-dev-slim.pkl'
    else:
        train_data_pkl = data_prefix + '-train-slim.pkl'
    test_results_file = os.path.join(
        config.DATA_DIR, 'Wiki/fetel-deep-results-{}.txt'.format(dataset))

    gres = exp_utils.GlobalRes(datafiles['type-vocab'], word_vecs_file)
    logging.info('dataset={}'.format(dataset))

    logging.info('use_bert = {}, use_lstm = {}, use_mlp={}'.format(
        config.use_bert, config.use_lstm, config.use_mlp))
    logging.info(
        'type_embed_dim={} cxt_lstm_hidden_dim={} pmlp_hdim={}'.format(
            config.type_embed_dim, config.lstm_hidden_dim,
            config.pred_mlp_hdim))
    logging.info('rand_per={} per_pen={}'.format(config.rand_per,
                                                 config.per_penalty))

    print('loading training data {} ...'.format(train_data_pkl),
          end=' ',
          flush=True)
    training_samples = datautils.load_pickle_data(train_data_pkl)
    print('done', flush=True)
    # training_samples = exp_utils.anchor_samples_to_model_samples_bert(config, samples, gres.parent_type_ids_dict)

    print('loading dev data {} ...'.format(dev_data_pkl), end=' ', flush=True)
    dev_samples = datautils.load_pickle_data(dev_data_pkl)
    print('done', flush=True)
    dev_true_labels_dict = {
        s[0]: [
            gres.type_vocab[l]
            for l in utils.get_full_type_ids(s[4], gres.parent_type_ids_dict)
        ]
        for s in dev_samples
    }

    test_samples = exp_utils.model_samples_from_json(
        config, gres.token_id_dict, gres.unknown_token_id, gres.type_id_dict,
        datafiles['fetel-test-mentions'], datafiles['fetel-test-sents'])
    test_true_labels_dict = {
        s[0]: [gres.type_vocab[l] for l in s[4]]
        for s in test_samples
    }

    logging.info('building model...')
    model = fet_model(config, device, gres.type_vocab, gres.type_id_dict,
                      gres.embedding_layer)
    model.to(device)

    logging.info('{}'.format(model.__class__.__name__))
    logging.info('training batch size: {}'.format(config.batch_size))

    # get person penalty vector
    person_type_id = gres.type_id_dict.get('/person')
    l2_person_type_ids = None
    person_loss_vec = None
    if person_type_id is not None:
        l2_person_type_ids = exp_utils.get_l2_person_type_ids(gres.type_vocab)
        person_loss_vec = np.ones(gres.n_types, np.float32)
        for tid in l2_person_type_ids:
            person_loss_vec[tid] = config.per_penalty
        person_loss_vec = torch.tensor(person_loss_vec,
                                       dtype=torch.float32,
                                       device=device)

    n_batches = (len(training_samples) + config.batch_size -
                 1) // config.batch_size
    n_steps = config.n_iter * n_batches
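    # Plain Adam when training the LSTM encoder; BertAdam with warmup and
    # weight-decay parameter grouping when fine-tuning BERT.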
    if config.use_lstm:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate)
    elif config.use_bert:
        from pytorch_pretrained_bert.optimization import BertAdam
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.bert_adam_warmup,
                             t_total=n_steps)
        # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=n_batches,
                                                gamma=config.lr_gamma)
    losses = list()
    best_dev_acc = -1

    # start training
    logging.info(
        '{} steps, {} steps per iter, learning rate={}, lr_decay={}, start training ...'
        .format(config.n_iter * n_batches, n_batches, config.learning_rate,
                config.lr_gamma))
    step = 0
    while True:
        if step == n_steps:
            break

        batch_idx = step % n_batches
        batch_beg, batch_end = batch_idx * config.batch_size, min(
            (batch_idx + 1) * config.batch_size, len(training_samples))
        context_token_list, mention_token_idxs, mstr_token_seqs, type_vecs \
            = exp_utils.samples_to_tensor(
            config, device, gres, training_samples[batch_beg:batch_end],
            person_type_id, l2_person_type_ids)
        model.train()
        logits = model(context_token_list, mention_token_idxs, mstr_token_seqs)
        loss = model.get_loss(type_vecs,
                              logits,
                              person_loss_vec=person_loss_vec)
        scheduler.step()
        optimizer.zero_grad()

        loss.backward()
        if config.use_lstm:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0,
                                           float('inf'))
        optimizer.step()
        losses.append(loss.data.cpu().numpy())
        # logging.info('step={}/{} accumulated loss = {:.4f}, loss = {:.4f}'.format(step, n_steps, sum(losses), loss))

        step += 1

        eval_cycle = 1 if config.test else 100
        if step % eval_cycle == 0:
            l_v, acc_v, pacc_v, _, _, dev_results = \
                model_utils.eval_fetel(config, gres, model, dev_samples, dev_true_labels_dict)

            _, acc_t, pacc_t, maf1, mif1, test_results = \
                model_utils.eval_fetel(config, gres, model, test_samples, test_true_labels_dict)

            best_tag = '*' if acc_v > best_dev_acc else ''
            # logging.info(
            #     'step={}/{} l={:.4f} l_v={:.4f} acc_v={:.4f} paccv={:.4f}{}\n'.format(
            #         step, n_steps, loss, l_v, acc_v, pacc_v, best_tag))
            logging.info('step={}/{}, learning rate={}, losses={:.4f}'.format(
                step, n_steps, optimizer.param_groups[0]['lr'], sum(losses)))
            logging.info(
                'evaluation result: '
                'l_v={:.4f} acc_v={:.4f} paccv={:.4f} acc_t={:.4f} macro_f1={:.4f} micro_f1={:.4f}{}\n'
                .format(l_v, acc_v, pacc_v, acc_t, maf1, mif1, best_tag))
            if acc_v > best_dev_acc and save_model_file:
                # torch.save(model.state_dict(), save_model_file)
                logging.info('model saved to {}'.format(save_model_file))

            if test_results_file is not None and acc_v > best_dev_acc:
                datautils.save_json_objs(dev_results, test_results_file)
                logging.info('dev results saved to {}'.format(test_results_file))

            if acc_v > best_dev_acc:
                best_dev_acc = acc_v
            losses = list()
            # if config.test:
            #     input('proceed? ')

        pass
Example #6
def train_fetel(device, gres: exputils.GlobalRes, el_entityvec: ELDirectEntityVec, train_samples_pkl,
                dev_samples_pkl, test_mentions_file, test_sents_file, test_noel_preds_file, type_embed_dim,
                context_lstm_hidden_dim, learning_rate, batch_size, n_iter, dropout, rand_per, per_penalty,
                use_mlp=False, pred_mlp_hdim=None, save_model_file=None, nil_rate=0.5,
                single_type_path=False, stack_lstm=False, concat_lstm=False, results_file=None):
    logging.info('results_file={}'.format(results_file))
    logging.info(
        'type_embed_dim={} cxt_lstm_hidden_dim={} pmlp_hdim={} nil_rate={} single_type_path={}'.format(
            type_embed_dim, context_lstm_hidden_dim, pred_mlp_hdim, nil_rate, single_type_path))
    logging.info('rand_per={} per_pen={}'.format(rand_per, per_penalty))
    logging.info('stack_lstm={} cat_lstm={}'.format(stack_lstm, concat_lstm))

    if stack_lstm:
        model = FETELStack(
            device, gres.type_vocab, gres.type_id_dict, gres.embedding_layer, context_lstm_hidden_dim,
            type_embed_dim=type_embed_dim, dropout=dropout, use_mlp=use_mlp, mlp_hidden_dim=pred_mlp_hdim,
            concat_lstm=concat_lstm)
    else:
        model = None
    if device.type == 'cuda':
        model = model.cuda(device.index)

    train_samples = datautils.load_pickle_data(train_samples_pkl)

    dev_samples = datautils.load_pickle_data(dev_samples_pkl)
    dev_samples = anchor_samples_to_model_samples(dev_samples, gres.mention_token_id, gres.parent_type_ids_dict)

    lr_gamma = 0.7
    eval_batch_size = 32
    logging.info('{}'.format(model.__class__.__name__))
    dev_true_labels_dict = {s.mention_id: [gres.type_vocab[l] for l in s.labels] for s in dev_samples}
    dev_entity_vecs, dev_el_sgns, dev_el_probs = __get_entity_vecs_for_samples(el_entityvec, dev_samples, None)

    test_samples = model_samples_from_json(gres.token_id_dict, gres.unknown_token_id, gres.mention_token_id,
                                           gres.type_id_dict, test_mentions_file, test_sents_file)
    test_noel_pred_results = datautils.read_pred_results_file(test_noel_preds_file)

    test_mentions = datautils.read_json_objs(test_mentions_file)
    test_entity_vecs, test_el_sgns, test_el_probs = __get_entity_vecs_for_mentions(
        el_entityvec, test_mentions, test_noel_pred_results, gres.n_types)

    test_true_labels_dict = {m['mention_id']: m['labels'] for m in test_mentions} if (
            'labels' in next(iter(test_mentions))) else None

    person_type_id = gres.type_id_dict.get('/person')
    l2_person_type_ids, person_loss_vec = None, None
    if person_type_id is not None:
        l2_person_type_ids = __get_l2_person_type_ids(gres.type_vocab)
        person_loss_vec = exputils.get_person_type_loss_vec(
            l2_person_type_ids, gres.n_types, per_penalty, model.device)

    dev_results_file = None
    n_batches = (len(train_samples) + batch_size - 1) // batch_size
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=n_batches, gamma=lr_gamma)
    losses = list()
    best_dev_acc = -1
    logging.info('{} steps, {} steps per iter, lr_decay={}, start training ...'.format(
        n_iter * n_batches, n_batches, lr_gamma))
    step = 0
    n_steps = n_iter * n_batches
    while step < n_steps:
        batch_idx = step % n_batches
        batch_beg, batch_end = batch_idx * batch_size, min((batch_idx + 1) * batch_size, len(train_samples))
        batch_samples = anchor_samples_to_model_samples(
            train_samples[batch_beg:batch_end], gres.mention_token_id, gres.parent_type_ids_dict)
        if rand_per:
            entity_vecs, el_sgns, el_probs = __get_entity_vecs_for_samples(
                el_entityvec, batch_samples, None, True, person_type_id, l2_person_type_ids, gres.type_vocab)
        else:
            entity_vecs, el_sgns, el_probs = __get_entity_vecs_for_samples(el_entityvec, batch_samples, None, True)

        use_entity_vecs = True
        model.train()

        (context_token_seqs, mention_token_idxs, mstrs, mstr_token_seqs, y_true
         ) = exputils.get_mstr_cxt_label_batch_input(model.device, gres.n_types, batch_samples)

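        # With probability nil_rate, zero out an entity vector so the model also
        # learns to predict types when no entity-linking signal is available.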
        if use_entity_vecs:
            for i in range(entity_vecs.shape[0]):
                if np.random.uniform() < nil_rate:
                    entity_vecs[i] = np.zeros(entity_vecs.shape[1], np.float32)
            el_probs = torch.tensor(el_probs, dtype=torch.float32, device=model.device)
            entity_vecs = torch.tensor(entity_vecs, dtype=torch.float32, device=model.device)
        else:
            entity_vecs = None
        logits = model(context_token_seqs, mention_token_idxs, mstr_token_seqs, entity_vecs, el_probs)
        loss = model.get_loss(y_true, logits, person_loss_vec=person_loss_vec)
        scheduler.step()
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0, float('inf'))
        optimizer.step()
        losses.append(loss.data.cpu().numpy())

        step += 1
        if step % 1000 == 0:
            # logging.info('i={} l={:.4f}'.format(step + 1, sum(losses)))
            acc_v, pacc_v, _, _, dev_results = eval_fetel(
                gres, model, dev_samples, dev_entity_vecs, dev_el_probs, eval_batch_size,
                use_entity_vecs=use_entity_vecs, single_type_path=single_type_path,
                true_labels_dict=dev_true_labels_dict)
            acc_t, _, maf1, mif1, test_results = eval_fetel(
                gres, model, test_samples, test_entity_vecs, test_el_probs, eval_batch_size,
                use_entity_vecs=use_entity_vecs, single_type_path=single_type_path,
                true_labels_dict=test_true_labels_dict)

            best_tag = '*' if acc_v > best_dev_acc else ''
            logging.info(
                'i={} l={:.4f} accv={:.4f} paccv={:.4f} acct={:.4f} maf1={:.4f} mif1={:.4f}{}'.format(
                    step, sum(losses), acc_v, pacc_v, acc_t, maf1, mif1, best_tag))
            if acc_v > best_dev_acc and save_model_file:
                torch.save(model.state_dict(), save_model_file)
                logging.info('model saved to {}'.format(save_model_file))

            if dev_results_file is not None and acc_v > best_dev_acc:
                datautils.save_json_objs(dev_results, dev_results_file)
                logging.info('dev results saved to {}'.format(dev_results_file))
            if results_file is not None and acc_v > best_dev_acc:
                datautils.save_json_objs(test_results, results_file)
                logging.info('test results saved to {}'.format(results_file))

            if acc_v > best_dev_acc:
                best_dev_acc = acc_v
            losses = list()