# Example #1
0
def main(args):
    """Score the facts of a dataset with a trained ensembler and write a TSV.

    The function reads the dataset at ``args.path_input``, builds the
    vocabularies, restores the ensembler weights from ``args.path_model``,
    runs the classifier once per distinct ``fact_en`` and writes the scored
    alignments to ``args.path_output`` sorted by descending score.

    Attributes read from ``args``:
        verbose, encoding: stored in the module-level globals of the same
            name (presumably consumed by helpers elsewhere in this file).
        poly_degree: must be >= 1; forwarded to the feature/model builders.
        gpu: device id; a negative value keeps everything on the CPU.
        path_input, flag_has_header: forwarded to ``read_dataset``.
        path_rels: relation vocabulary file.
        model: ``'linear...'`` or ``'mlp[:dim_hid[:activation]]'``.
        flag_unifw: forwarded to the ensembler constructor.
        path_model: npz file with the trained ensembler.
        path_output: destination TSV file.

    Raises:
        AssertionError: if ``args.poly_degree`` is smaller than 1.
        ValueError: if ``args.model`` names an unknown model type.
    """
    global verbose, encoding
    verbose = args.verbose
    encoding = args.encoding
    assert args.poly_degree >= 1, '--degree must be positive integer'
    poly_degree = args.poly_degree

    gpu = args.gpu
    use_gpu = gpu >= 0
    if use_gpu:
        cuda.check_cuda_available()
        if verbose:
            logger.info('Use GPU {}'.format(gpu))
        cuda.get_device_from_id(gpu).use()

    df = read_dataset(args.path_input, args.flag_has_header)

    # NOTE: the filter below (dropping facts whose mean 'twa' is exactly
    # 0.0 or 1.0) is currently disabled.
    # agg = df.groupby('fact_en')['twa'].mean()
    # invalid_facts = set(agg[(agg == 1.0)|(agg == 0.0)].index)
    # if verbose:
    #     logger.info('Invalid facts: {}'.format(len(invalid_facts)))
    # df = df[~df['fact_en'].isin(invalid_facts)]
    # if verbose:
    #     logger.info('Remained {} lines'.format(len(df)))

    # Load vocabulary
    if verbose:
        logger.info('Load vocabulary')
    rel2id = Vocabulary()
    rel2id.read_from_file(args.path_rels)
    fact2id = Vocabulary()
    fact2id.read_from_list(np.unique(get_values(df, 'fact')))
    ja2id = Vocabulary()
    ja2id.read_from_list(np.unique(get_values(df, 'fact_ja')))
    en2id = Vocabulary()
    en2id.read_from_list(np.unique(get_values(df, 'fact_en')))

    # Keep the original fact strings as the index so that rows can still be
    # looked up by string after the columns are replaced with integer ids.
    df.index = df['fact']
    df.loc[:, 'fact'] = replace_by_dic(df['fact'], fact2id).astype(np.int32)
    df.loc[:, 'fact_ja'] = replace_by_dic(df['fact_ja'], ja2id).astype(np.int32)
    df.loc[:, 'fact_en'] = replace_by_dic(df['fact_en'], en2id).astype(np.int32)
    df.loc[:, 'rel'] = replace_by_dic(df['rel'], rel2id).astype(np.int32)

    # English fact id -> set of aligned fact ids sharing that English side.
    en2ja = {en: set(df[df['fact_en'] == en]['fact'].unique())
             for en in sorted(df['fact_en'].unique())}
    idx2vec = get_idx2vec(df, poly_degree=poly_degree)

    # One row per distinct English fact, served as a single batch.
    ss = df.drop_duplicates('fact_en')
    itr = FactIterator(ss, len(ss), ja2id, en2id, train=False, evaluate=True,
                       repeat=False, poly_degree=poly_degree)

    # Define a model
    model_type = args.model.lower()
    dim_in = len(COL_BASIC_FEATURES)
    rel_size = len(rel2id)
    if model_type.startswith('linear'):
        ensembler = LinearEnsembler(dim_in, rel_size, use_gpu=use_gpu,
                                    poly_degree=poly_degree,
                                    flag_unifw=args.flag_unifw,
                                    verbose=verbose)
    elif model_type.startswith('mlp'):
        # 'mlp:<dim_hid>:<activation>' -- both suffixes are optional.
        options = args.model.split(':')
        params = {}
        if len(options) > 1:
            params['dim_hid'] = int(options[1])
        if len(options) > 2:
            params['activation'] = options[2]
        ensembler = MLPEnsembler(
            dim_in, rel_size, use_gpu=use_gpu,
            poly_degree=poly_degree, flag_unifw=args.flag_unifw,
            verbose=verbose, **params)
    else:
        raise ValueError('Invalid --model: {}'.format(model_type))

    # Register the standardization statistics as persistents so that
    # load_npz can restore them together with the weights.
    ensembler.add_persistent('_mu', None)
    ensembler.add_persistent('_sigma', None)
    # load a trained model
    chainer.serializers.load_npz(args.path_model, ensembler)
    if ensembler._mu is not None:
        logger.info('standardize vectors: True')
        itr.standardize_vectors(mu=ensembler._mu, sigma=ensembler._sigma)
        idx2vec = standardize_vectors(idx2vec, ensembler._mu, ensembler._sigma)
    else:
        logger.info('standardize vectors: False')

    # Transfer to the GPU only after standardization, mirroring the training
    # script, so that standardize_vectors operates on host arrays together
    # with the statistics restored from the npz file.
    if use_gpu:
        idx2vec = cuda.to_gpu(idx2vec)

    model = Classifier(ensembler, en2ja, idx2vec)

    # calculate probabilities for testing set
    batch_scores = []
    for i, (rels, _, en_indices) in enumerate(itr, start=1):
        if i % 500 == 0:
            logger.info('Evaluating: {}'.format(i))
        batch_scores.append(model(rels, en_indices))
    # Each element is unpacked below as (idx_fact, idx_en, score).
    scores = list(chain.from_iterable(batch_scores))

    # Pick the label column once instead of raising KeyError for every row
    # when the dataset has no 'label' column.
    label_col = 'label' if 'label' in df.columns else 'twa'

    if verbose:
        logger.info('Output results to ' + args.path_output)
    # The rows contain Japanese text, so write with the configured encoding
    # rather than the platform default (None keeps the platform default).
    with open(args.path_output, 'w', encoding=encoding) as f:
        header = '\t'.join(['rel', 'start', 'end', 'start_en', 'end_en',
                            'score', 'label'])
        f.write(header + '\n')
        for row in sorted(scores, key=lambda t: t[2], reverse=True):
            idx_fact, idx_en, score = row
            fact = fact2id.id2word[idx_fact]
            # A fact string is '<ja side>@@@<en side>', each side being
            # '<rel>|||<start>|||<end>'.
            fact_ja, fact_en = fact.split('@@@')
            rel, start_en, end_en = fact_en.split('|||')
            rel, start_ja, end_ja = fact_ja.split('|||')
            label = df.loc[fact, label_col]
            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                rel, start_ja, end_ja, start_en, end_en, score, label))
# Example #2
0
def main(args):
    """Train an ensembler on the training split and evaluate on the devel split.

    Attributes read from ``args``:
        verbose: stored in the module-level global ``verbose``.
        poly_degree: must be >= 1; forwarded to the feature/model builders.
        gpu: device id; a negative value keeps everything on the CPU.
        n_epochs, batch_size, lr, margin, lam: training hyper-parameters.
        dir_in, filename_train, filename_devel: dataset location.
        path_rels: relation vocabulary file.
        model: ``'linear...'`` or ``'mlp[:dim_hid[:activation]]'``.
        flag_unifw: forwarded to the ensembler constructor.
        flag_standardize: standardize feature vectors with statistics
            obtained from the training iterator.
        dir_out: trainer output directory.
        save: whether to take snapshots and keep the best model.

    Returns:
        The score component of ``evaluator.get_best_score()``.

    Raises:
        AssertionError: if ``args.poly_degree`` is smaller than 1, or if the
            size of ``idx2vec`` does not equal the number of facts plus one.
        ValueError: if ``args.model`` names an unknown model type.
    """
    global verbose
    verbose = args.verbose

    assert args.poly_degree >= 1, '--degree must be positive integer'
    poly_degree = args.poly_degree

    if verbose:
        report_params(args)

    gpu = args.gpu
    use_gpu = gpu >= 0
    if use_gpu:
        cuda.check_cuda_available()
        if verbose:
            logger.info('Use GPU {}'.format(gpu))
        cuda.get_device_from_id(gpu).use()

    set_random_seed(0, use_gpu=use_gpu)

    n_epochs = args.n_epochs

    # Dataset
    dfs = {}
    dfs['train'] = read_dataset(path.join(args.dir_in, args.filename_train))
    dfs['devel'] = read_dataset(path.join(args.dir_in, args.filename_devel))
    all_dfs = list(dfs.values())

    # Load relation vocabulary
    rel2id = Vocabulary()
    rel2id.read_from_file(args.path_rels)

    # Load concept vocabulary
    if verbose:
        logger.info('Load vocabulary')
    fact2id = Vocabulary()
    fact2id.read_from_list(np.unique(get_values(all_dfs, 'fact')))
    ja2id = Vocabulary()
    ja2id.read_from_list(np.unique(get_values(all_dfs, 'fact_ja')))
    en2id = Vocabulary()
    en2id.read_from_list(np.unique(get_values(all_dfs, 'fact_en')))

    if verbose:
        logger.info('Replace facts with indices')
    for split_df in all_dfs:
        split_df.loc[:, 'fact'] = replace_by_dic(split_df['fact'],
                                                 fact2id).astype(np.int32)
        split_df.loc[:, 'fact_ja'] = replace_by_dic(split_df['fact_ja'],
                                                    ja2id).astype(np.int32)
        split_df.loc[:, 'fact_en'] = replace_by_dic(split_df['fact_en'],
                                                    en2id).astype(np.int32)
        split_df.loc[:, 'rel'] = replace_by_dic(split_df['rel'],
                                                rel2id).astype(np.int32)

    # 'twa' value (0 or 1) -> set of fact ids carrying it, over both splits.
    label2fact = {
        i: set(
            np.concatenate(
                [df[df['twa'] == i]['fact'].unique() for df in all_dfs]))
        for i in [0, 1]
    }

    # English fact id -> set of aligned fact ids. The sets are merged across
    # splits: an English fact present in both train and devel must keep the
    # candidates of both instead of only those of the last split processed.
    en2ja = {}
    for split_df in all_dfs:
        for en in sorted(split_df['fact_en'].unique()):
            facts = split_df[split_df['fact_en'] == en]['fact'].unique()
            en2ja.setdefault(en, set()).update(facts)

    idx2vec = get_idx2vec(all_dfs, poly_degree=poly_degree)

    n_facts = len(fact2id)
    n_en = len(en2id)
    n_ja = len(ja2id)
    assert n_facts + 1 == len(
        idx2vec), '{}[n_facts] != {}[len(idx2vec)]'.format(
            n_facts + 1, len(idx2vec))

    if verbose:
        logger.info('Alignment: {}'.format(n_facts))
        logger.info('En: {}'.format(n_en))
        logger.info('Ja: {}'.format(n_ja))
        logger.info('Train: {}'.format(len(dfs['train'])))
        logger.info('Devel: {}'.format(len(dfs['devel'])))

    model_type = args.model.lower()
    dim_in = len(COL_BASIC_FEATURES)
    rel_size = len(rel2id)
    if model_type.startswith('linear'):
        ensembler = LinearEnsembler(dim_in,
                                    rel_size,
                                    use_gpu=use_gpu,
                                    poly_degree=poly_degree,
                                    flag_unifw=args.flag_unifw,
                                    verbose=verbose)
    elif model_type.startswith('mlp'):
        # 'mlp:<dim_hid>:<activation>' -- both suffixes are optional.
        options = args.model.split(':')
        params = {}
        if len(options) > 1:
            params['dim_hid'] = int(options[1])
        if len(options) > 2:
            params['activation'] = options[2]
        ensembler = MLPEnsembler(dim_in,
                                 rel_size,
                                 use_gpu=use_gpu,
                                 poly_degree=poly_degree,
                                 flag_unifw=args.flag_unifw,
                                 verbose=verbose,
                                 **params)
    else:
        raise ValueError('Invalid --model: {}'.format(model_type))

    # Set up a dataset iterator
    train_iter = FactIterator(dfs['train'],
                              args.batch_size,
                              ja2id,
                              en2id,
                              train=True,
                              repeat=True,
                              poly_degree=poly_degree)
    # Only keep positive examples in development set
    df = dfs['devel'][dfs['devel']['twa'] == 1].drop_duplicates('fact_en')

    # Set the devel batch size; when find_greatest_divisor yields 1 and the
    # set is small enough, evaluate it as a single batch instead.
    batch_size = find_greatest_divisor(len(df))
    if batch_size == 1 and len(df) <= 10**4:
        batch_size = len(df)
    if verbose:
        logger.info('Devel batch size = {}'.format(batch_size))
    devel_iter = FactIterator(df,
                              batch_size,
                              ja2id,
                              en2id,
                              train=False,
                              repeat=False,
                              poly_degree=poly_degree)

    # Standardize vectors using the statistics returned by the training
    # iterator, applied identically to the devel iterator and to idx2vec.
    if args.flag_standardize:
        mu, sigma = train_iter.standardize_vectors()
        devel_iter.standardize_vectors(mu=mu, sigma=sigma)
        idx2vec = standardize_vectors(idx2vec, mu, sigma)
    else:
        mu, sigma = None, None

    # Transfer to the GPU only after standardization on the host.
    if use_gpu:
        idx2vec = cuda.to_gpu(idx2vec)

    # Set up a model
    model = Classifier(ensembler,
                       label2fact,
                       en2ja,
                       idx2vec,
                       margin=args.margin,
                       lam=args.lam)

    if use_gpu:
        model.to_gpu(device=gpu)

    # Set up an optimizer
    optimizer = optimizers.AdaGrad(lr=args.lr)
    optimizer.setup(model)

    # Set up a trainer
    updater = Updater(train_iter, optimizer, device=gpu)
    trainer = training.Trainer(updater, (n_epochs, 'epoch'), out=args.dir_out)

    # evaluate development set
    evaluator = Evaluator(devel_iter, model, device=gpu)
    trainer.extend(evaluator)

    # Write out a log
    trainer.extend(extensions.LogReport())
    # Display training status
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss',
            'validation/main/meanrank', 'validation/main/mrr', 'elapsed_time'
        ]))

    if args.save:
        # Full trainer snapshot once at the end; ensembler snapshot every
        # epoch.
        trainer.extend(extensions.snapshot(), trigger=(n_epochs, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            ensembler, 'model_iter_{.updater.iteration}'),
                       trigger=(1, 'epoch'))

    # Launch training process
    trainer.run()

    # Report the best score
    (epoch, score) = evaluator.get_best_score()
    if verbose:
        logger.info('Best score: {} (epoch={})'.format(score, epoch))

    # Clean the output directory
    if args.save:
        save_best_model(args.dir_out, ensembler, mu=mu, sigma=sigma)

    return score