Example #1
def experiment(config: DictConfig):
    train_in_path = hydra.utils.to_absolute_path(config.data.raw_train_path)
    test_in_path = hydra.utils.to_absolute_path(config.data.raw_test_path)
    test_score_path = hydra.utils.to_absolute_path(
        config.data.raw_test_score_path)
    train_out_path = hydra.utils.to_absolute_path(config.data.train_path)
    test_out_path = hydra.utils.to_absolute_path(config.data.test_path)

    # Preprocess the raw MedSTS files into model-ready train/test sets
    train_df = prepare_train(train_in_path, train_out_path)
    test_df = prepare_test(test_in_path, test_score_path, test_out_path)

    model = train_model(config)

    # Score each sentence pair with the trained model
    train_df['similarity'] = sentence_similarity(model, train_df)
    test_df['similarity'] = sentence_similarity(model, test_df)

    train_correlation = pearson_correlation(train_df.similarity,
                                            train_df.score)
    test_correlation = pearson_correlation(test_df.similarity, test_df.score)

    train_chart = plot_scatter(train_df, train_correlation)
    train_chart.save('train_chart.json')

    test_chart = plot_scatter(test_df, test_correlation)
    test_chart.save('test_chart.json')

    logger.info('Encoder %s', config.model.encoder)
    logger.info('Pooling %s', config.model.pooling)
    logger.info('Normalizer %s', config.model.normalizer)
    logger.info('MedSTS Train correlation %s', train_correlation)
    logger.info('MedSTS Test correlation %s', test_correlation)
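The helpers called above (`prepare_train`, `sentence_similarity`, `pearson_correlation`, ...) live in the surrounding module and are not shown. A minimal sketch of what two of them might look like, assuming the model exposes a sentence-transformers-style `encode` method and that the frames carry `sentence_1`/`sentence_2` columns (both assumptions, not the original code):

# Hypothetical sketch -- 'model.encode' returning a 2-D embedding array and
# the column names are assumptions; the original helpers may differ.
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def pearson_correlation(similarity: pd.Series, score: pd.Series) -> float:
    # Pearson r between predicted similarity and the gold STS score.
    return pearsonr(similarity, score)[0]


def sentence_similarity(model, df: pd.DataFrame) -> np.ndarray:
    # Cosine similarity between the embeddings of each sentence pair.
    a = np.asarray(model.encode(df['sentence_1'].tolist()))
    b = np.asarray(model.encode(df['sentence_2'].tolist()))
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return (a * b).sum(axis=1)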
Example #2
def experiment(config: DictConfig):
    df = pd.read_csv(hydra.utils.to_absolute_path(config.data.raw_path))
    df.columns = ['pair_id', 'sentence_1', 'sentence_2',
                  'a_1', 'a2', 'a3', 'a4', 'a5', 'score']

    # Bin the continuous scores so StratifiedKFold has discrete labels
    df['score_bin'] = df.score.astype(int)
    results = []
    cross_validation = StratifiedKFold(n_splits=10, shuffle=True,
                                       random_state=config.data.random_state)

    for i, (train_index, test_index) in enumerate(
            cross_validation.split(df, df.score_bin.values)):
        # .copy() so the 'similarity' column can be added below without a
        # SettingWithCopyWarning on these slices
        train_df = df.iloc[train_index].copy()
        test_df = df.iloc[test_index].copy()

        to_file(train_df, hydra.utils.to_absolute_path(config.data.train_path))
        to_file(test_df, hydra.utils.to_absolute_path(config.data.test_path))

        model = train_model(config)

        train_df['similarity'] = sentence_similarity(model, train_df)
        test_df['similarity'] = sentence_similarity(model, test_df)

        train_correlation = pearson_correlation(train_df.similarity, train_df.score)
        test_correlation = pearson_correlation(test_df.similarity, test_df.score)
        logger.info('Cross Validation Split %s', i)
        logger.info('Train correlation %s', train_correlation)
        logger.info('Test correlation %s', test_correlation)
        results.append({'train': train_correlation, 'test': test_correlation})

    result_df = pd.DataFrame(results)

    logger.info('Encoder %s', config.model.encoder)
    logger.info('Pooling %s', config.model.pooling)
    logger.info('Normalizer %s', config.model.normalizer)
    logger.info('BIOSSES Train correlation %s', result_df.train.mean())
    logger.info('BIOSSES Test correlation %s', result_df.test.mean())
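StratifiedKFold stratifies on discrete class labels, which is why the example derives `score_bin` from the continuous BIOSSES scores before splitting. A toy illustration of the binning (the score values here are made up):

# Flooring scores to integers keeps each fold's score distribution balanced.
import numpy as np
from sklearn.model_selection import StratifiedKFold

scores = np.array([0.2, 1.7, 2.5, 3.9, 1.1, 2.2, 3.3, 0.8])
bins = scores.astype(int)  # plays the role of df['score_bin']

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(cv.split(scores, bins)):
    # Each fold sees roughly the same mix of score bins
    print(fold, sorted(bins[train_idx]), sorted(bins[test_idx]))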
Example #3
def experiment(config):
    base_dir = hydra.utils.to_absolute_path(config.data.raw_path)
    train_path = hydra.utils.to_absolute_path(config.data.train_path)
    test_path = hydra.utils.to_absolute_path(config.data.test_path)

    labels = sorted(os.listdir(base_dir))

    # Collect all files and train model on all official training files
    train_dfs = []
    test_dfs = []

    for label in labels:
        train_dfs.append(get_dataset(base_dir, label, 'train'))
        train_dfs.append(get_dataset(base_dir, label, 'devel'))
        test_dfs.append(get_dataset(base_dir, label, 'test'))

    train_dfs = pd.concat(train_dfs)
    test_dfs = pd.concat(test_dfs)
    to_txt(train_dfs, train_path)
    to_txt(test_dfs, test_path)

    model = train_model(config)

    # Train binary classifier for each label
    for label in labels:
        logger.info('Classifying: %s', label)

        train_df = get_dataset(base_dir, label, 'train')
        val_df = get_dataset(base_dir, label, 'devel')
        test_df = get_dataset(base_dir, label, 'test')

        classify(model, train_df, val_df, test_df, label)
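This example implies a data layout with one directory per label, each holding official `train`/`devel`/`test` splits. A hypothetical sketch of the assumed helpers; the file format, extension, and column names are guesses, not taken from the original module:

import os
import pandas as pd


def get_dataset(base_dir: str, label: str, split: str) -> pd.DataFrame:
    # Load one split of one label's binary-classification data.
    path = os.path.join(base_dir, label, f'{split}.tsv')
    df = pd.read_csv(path, sep='\t', names=['text', 'target'])
    df['label'] = label
    return df


def to_txt(df: pd.DataFrame, path: str) -> None:
    # Dump raw text one document per line, e.g. for language-model training.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(df['text'].astype(str)))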
Example #4
def experiment(config):
    prepare_datasets(config)
    model = train_model(config)

    logger.info('Test set ablation study')
    logger.info('Sampling %s documents per graph category for ablation study',
                config.ablation.num_samples)
    df = sample_documents(config, model.test_df, config.ablation.num_samples)
    ablation_df = ablation_study(model, config, df)
    ablation_df.to_csv('ablation.csv')
    logger.info(ablation_df.groupby('ablation_category').mean().to_string())

    if config.ablation.distance_map:
        logger.info('Distance: Mean Average Precision: %s',
                    ablation_df['distance_map'].mean())
        logger.info('Distance: Median Mean Average Precision: %s',
                    ablation_df['distance_map'].median())
    if config.ablation.attention_map:
        logger.info('Attention: Mean Average Precision: %s',
                    ablation_df['attention_map'].mean())
        logger.info('Attention: Median Mean Average Precision: %s',
                    ablation_df['attention_map'].median())

    logger.info('Test set ablation study (Unique Words)')
    logger.info('Sampling %s documents per graph category for ablation study',
                config.ablation.num_samples)
    df = sample_documents(config, model.test_df, config.ablation.num_samples)
    ablation_df = ablation_study(model, config, df, unique_tokens=True)
    ablation_df.to_csv('ablation_unique.csv')
    logger.info(ablation_df.groupby('ablation_category').mean().to_string())

    if config.ablation.distance_map:
        logger.info('Distance: Mean Average Precision: %s',
                    ablation_df['distance_map'].mean())
        logger.info('Distance: Median Mean Average Precision: %s',
                    ablation_df['distance_map'].median())
    if config.ablation.attention_map:
        logger.info('Attention: Mean Average Precision: %s',
                    ablation_df['attention_map'].mean())
        logger.info('Attention: Median Mean Average Precision: %s',
                    ablation_df['attention_map'].median())
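The `distance_map` and `attention_map` columns appear to hold one average-precision value per sampled document, which the experiment then aggregates with mean and median. For reference, a minimal average-precision computation over a ranked relevance list (the column semantics are an assumption):

import numpy as np


def average_precision(relevant: np.ndarray) -> float:
    # relevant[i] is 1 if the item ranked at position i is relevant.
    hits = np.cumsum(relevant)
    ranks = np.arange(1, len(relevant) + 1)
    precision_at_hits = (hits / ranks)[relevant.astype(bool)]
    return float(precision_at_hits.mean()) if precision_at_hits.size else 0.0


ranked = np.array([1, 0, 1, 1, 0])  # toy ranked retrieval results
print(average_precision(ranked))    # -> 0.8055...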
Example #5
def experiment(config):
    train_df, test_df = prepare_datasets(config)
    model = train_model(config)
    classify(model, train_df, test_df)
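All five experiment functions take a Hydra config, so they are presumably launched through a `hydra.main` entry point. A minimal sketch of such wiring; the config path and config name are assumptions:

import hydra
from omegaconf import DictConfig


@hydra.main(config_path='conf', config_name='config')
def main(config: DictConfig) -> None:
    # Dispatch to whichever experiment() the module defines.
    experiment(config)


if __name__ == '__main__':
    main()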