Example #1
def test_ids_to_floats():
    path = join(_my_dir, 'train', 'test_input_2examples_1.jsonlines')

    examples = Reader.for_path(path, ids_to_floats=True, quiet=True).read()
    assert isinstance(examples.ids[0], float)

    examples = Reader.for_path(path, quiet=True).read()
    assert not isinstance(examples.ids[0], float)
    assert isinstance(examples.ids[0], str)
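
The test above reads a fixture file that is not shown. The following self-contained sketch (not the actual fixture) writes a tiny .jsonlines file in the SKLL layout, with "id", "y" (label), and "x" (feature dict) keys, and checks how ids_to_floats affects the parsed IDs. The skll.data import path is assumed here because the snippet above omits its imports, and the IDs are deliberately numeric strings so the float conversion can succeed.

# Self-contained sketch (not the actual test fixture).
import json
from tempfile import NamedTemporaryFile

from skll.data import Reader  # assumed import path for Reader

rows = [{"id": "1", "y": "cat", "x": {"f1": 1.0}},
        {"id": "2", "y": "dog", "x": {"f1": 2.0}}]

with NamedTemporaryFile('w', suffix='.jsonlines', delete=False) as tmp:
    tmp.write('\n'.join(json.dumps(row) for row in rows) + '\n')

# With ids_to_floats=True, the numeric string IDs come back as floats.
fs = Reader.for_path(tmp.name, ids_to_floats=True, quiet=True).read()
print(isinstance(fs.ids[0], float))  # True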
Example #2
def test_backward_compatibility():
    """
    Test to validate backward compatibility
    """
    predict_path = join(_my_dir, 'backward_compatibility',
                        ('v0.9.17_test_summary_test_summary_'
                         'LogisticRegression.predictions'))
    model_path = join(_my_dir, 'backward_compatibility',
                      ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                       '{}.model').format(sys.version_info[0]))
    test_path = join(_my_dir, 'backward_compatibility',
                     'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = Reader.for_path(test_path, quiet=True).read()
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        old_predictions = [float(line.strip()) for line in predict_file]
    assert_almost_equal(new_predictions, old_predictions)
Example #3
def test_backward_compatibility():
    """
    Test to validate backward compatibility
    """
    predict_path = join(_my_dir, 'backward_compatibility',
                        ('v0.9.17_test_summary_test_summary_'
                         'LogisticRegression.predictions'))
    model_path = join(_my_dir, 'backward_compatibility',
                      ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                       '{}.model').format(sys.version_info[0]))
    test_path = join(_my_dir, 'backward_compatibility',
                     'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = Reader.for_path(test_path, quiet=True).read()
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        old_predictions = [float(line.strip()) for
                           line in predict_file]
    assert_almost_equal(new_predictions, old_predictions)
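
The two backward-compatibility examples above hinge on Learner.from_file restoring a model that an older SKLL release saved to disk. Here is a minimal round-trip sketch (illustrative only, not part of the test suite) under the assumption of the usual skll.data/skll.learner module layout; the toy data and file name are made up.

# Illustrative round-trip: train, save, reload, and compare predictions.
from skll.data import FeatureSet
from skll.learner import Learner

# toy training data; FeatureSet vectorizes the feature dicts internally
ids = ['ex1', 'ex2', 'ex3', 'ex4']
labels = [0, 1, 0, 1]
features = [{'f1': 1.0}, {'f1': 3.0}, {'f1': 1.5}, {'f1': 2.5}]
train_fs = FeatureSet('toy', ids, labels=labels, features=features)

learner = Learner('LogisticRegression')
learner.train(train_fs, grid_search=False)
learner.save('toy_LogisticRegression.model')  # hypothetical path

# reloading should reproduce the original predictions exactly
reloaded = Learner.from_file('toy_LogisticRegression.model')
assert (reloaded.predict(train_fs) == learner.predict(train_fs)).all()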
Example #4
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    """
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary from metrics names to values
    """

    # read gold standard labels
    data = Reader.for_path(examples_file).read()
    gold = dict(zip(data.ids, data.labels))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip header
        for row in reader:
            pred[row[0]] = safe_float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
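
A hypothetical invocation of the function above; the file names are placeholders, and the predictions file is assumed to be the two-column id/prediction TSV that the function expects when no class probabilities are written.

# Hypothetical usage of compute_eval_from_predictions. The assumed TSV layout:
#
#     id          prediction
#     EXAMPLE_0   2.0
#     EXAMPLE_1   1.0
#
scores = compute_eval_from_predictions('dev.jsonlines',
                                       'dev_predictions.tsv',
                                       ['pearson', 'unweighted_kappa'])
print(scores)  # e.g. {'pearson': ..., 'unweighted_kappa': ...}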
Example #5
def compute_eval_from_predictions(examples_file, predictions_file, metric_names):
    """
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary from metrics names to values
    """

    # read gold standard labels
    data = Reader.for_path(examples_file).read()
    gold = dict(zip(data.ids, data.labels))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip header
        for row in reader:
            pred[row[0]] = safe_float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError("The example and prediction IDs do not match.")
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(
            metric_name, [gold[ex_id] for ex_id in example_ids], [pred[ex_id] for ex_id in example_ids]
        )
        res[metric_name] = score
    return res
Example #6
def main():
    """
    Create directories and split CSV files into subsets.
    """
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {
        'vitals': ['Sex', 'Age'],
        'socioeconomic': ['Pclass', 'Fare'],
        'family': ['SibSp', 'Parch'],
        'misc': ['Embarked']
    }
    features_to_keep = list(chain(*subset_dict.values()))

    # Create directories to store files
    if not os.path.exists('titanic/train'):
        logger.info('Creating titanic/train directory')
        os.makedirs('titanic/train')
    if not os.path.exists('titanic/dev'):
        logger.info('Creating titanic/dev directory')
        os.makedirs('titanic/dev')
    if not os.path.exists('titanic/train+dev'):
        logger.info('Creating titanic/train+dev directory')
        os.makedirs('titanic/train+dev')
    if not os.path.exists('titanic/test'):
        logger.info('Creating titanic/test directory')
        os.makedirs('titanic/test')

    usecols_train = features_to_keep + ['PassengerId', 'Survived']
    usecols_test = features_to_keep + ['PassengerId']

    # Read and write training FeatureSet
    train_fs = Reader.for_path('titanic/train.csv',
                               label_col='Survived',
                               id_col='PassengerId',
                               drop_blanks=True,
                               pandas_kwargs={
                                   'usecols': usecols_train
                               },
                               quiet=False,
                               sparse=False).read()

    train_fs.filter(features=features_to_keep)
    num_train_dev = len(train_fs)
    num_train = int((num_train_dev / 5) * 4)
    writer = Writer.for_path('titanic/train/.csv',
                             train_fs[:num_train],
                             id_col='PassengerId',
                             label_col='Survived',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write train+dev set for training model to use to generate predictions on
    # test
    writer = Writer.for_path('titanic/train+dev/.csv',
                             train_fs,
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write dev FeatureSet
    writer = Writer.for_path('titanic/dev/.csv',
                             train_fs[num_train:],
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Read and write test FeatureSet
    test_fs = Reader.for_path('titanic/test.csv',
                              label_col='Survived',
                              drop_blanks=True,
                              pandas_kwargs={
                                  'usecols': usecols_test
                              },
                              quiet=False,
                              sparse=False).read()

    test_fs.filter(features=features_to_keep)
    num_test = len(test_fs)
    test_fs.ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    writer = Writer.for_path('titanic/test/.csv',
                             test_fs,
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()
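
The odd-looking output paths such as 'titanic/train/.csv' supply only a directory and an extension; combined with the subsets dict, the writer is expected to emit one CSV per subset key in each split directory. A small illustrative check to run after main(), assuming that <subset>.csv naming scheme (the check itself is not part of the script):

# Illustrative sanity check: each subset key should have its own CSV
# under each split directory once main() has run.
import os

for split in ('train', 'dev', 'train+dev', 'test'):
    for subset in ('vitals', 'socioeconomic', 'family', 'misc'):
        path = os.path.join('titanic', split, subset + '.csv')
        status = 'ok' if os.path.exists(path) else 'missing'
        print(f'{status:7s} {path}')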
Example #7
def compute_eval_from_predictions(examples_file,
                                  predictions_file,
                                  metric_names,
                                  prediction_method=None):
    """
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    Parameters
    ----------
    examples_file: str
        Path to a SKLL examples file (in .jsonlines or other format).
    predictions_file: str
        Path to a SKLL predictions output TSV file with id and prediction column names.
    metric_names: list of str
        A list of SKLL metric names (e.g., [pearson, unweighted_kappa]).
    prediction_method: str or None
        Indicates how to get a single class prediction from the probabilities. Currently
        supported options are  "highest", which selects the class with the highest
        probability, and "expected_value", which calculates an expected value over
        integer classes and rounds to the nearest int. If predictions file does not
        contain probabilities, this should be set to None.

    Returns
    -------
    dict
        Maps metrics names to corresponding values.

    Raises
    ------
    ValueError
        If the requested prediction method is 'expected_value' but the class names can't
        be converted to ints.
    """

    # read gold standard labels
    data = Reader.for_path(examples_file).read()
    gold = dict(zip(data.ids, data.labels))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        header = next(reader)

        # If there are more than two columns, assume column 0 contains the ids, and
        # columns 1-n contain class probabilities. Convert them to a class prediction
        # using the specified `method`.
        if len(header) > 2:
            classes = [c for c in header[1:] if c]
            if prediction_method is None:
                prediction_method = "highest"
                logger.info("No prediction method specified. Using 'highest'.")
            if prediction_method == 'expected_value':
                try:
                    classes = [int(c) for c in classes]
                except ValueError as e:
                    raise e
            for row in reader:
                probabilities = [safe_float(p) for p in row[1:]]
                prediction = get_prediction_from_probabilities(
                    classes, probabilities, prediction_method)
                pred[row[0]] = safe_float(prediction)
        else:
            if prediction_method is not None:
                logger.warning(
                    "A prediction method was provided, but the predictions "
                    "file doesn't contain probabilities. Ignoring prediction "
                    "method '{}'.".format(prediction_method))

            for row in reader:
                pred[row[0]] = safe_float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
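
The helper get_prediction_from_probabilities used above is not shown in this example. Below is a rough stand-in based only on the docstring's description of the two methods; it is an illustration, not SKLL's actual implementation.

# Illustrative stand-in for the helper used above -- not SKLL's real code.
def get_prediction_from_probabilities(classes, probabilities, prediction_method):
    if prediction_method == 'highest':
        # class whose probability is largest
        return classes[probabilities.index(max(probabilities))]
    elif prediction_method == 'expected_value':
        # probability-weighted mean over integer classes, rounded to an int
        expected = sum(c * p for c, p in zip(classes, probabilities))
        return int(round(expected))
    else:
        raise ValueError(f'Unknown prediction method: {prediction_method}')

# e.g. classes [1, 2, 3] with probabilities [0.45, 0.1, 0.45]:
#   'highest'        -> 1 (first class with the maximum probability)
#   'expected_value' -> round(1*0.45 + 2*0.1 + 3*0.45) = round(2.0) = 2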