def test_ids_to_floats():
    '''
    Check that ``ids_to_floats`` decides whether loaded IDs are floats.
    '''
    input_path = os.path.join(_my_dir, 'train',
                              'test_input_2examples_1.jsonlines')

    # With the flag switched on, the first ID must come back as a float.
    converted_id = load_examples(input_path, ids_to_floats=True,
                                 quiet=True).ids[0]
    assert isinstance(converted_id, float)

    # With the default setting, the first ID must be a str and not a float.
    default_id = load_examples(input_path, quiet=True).ids[0]
    assert not isinstance(default_id, float)
    assert isinstance(default_id, str)
def test_backward_compatibility():
    '''
    Make sure a model saved by v0.9.17 can still be loaded and that it
    reproduces the predictions stored alongside it.
    '''
    compat_dir = os.path.join(_my_dir, 'backward_compatibility')

    # The model file name carries the major Python version it was saved with.
    model_name = ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                  '{}.model').format(sys.version_info[0])
    expected_path = os.path.join(
        compat_dir,
        'v0.9.17_test_summary_test_summary_LogisticRegression.predictions')
    data_path = os.path.join(compat_dir, 'v0.9.17_test_summary.jsonlines')

    old_learner = Learner.from_file(os.path.join(compat_dir, model_name))
    test_examples = load_examples(data_path, quiet=True)

    # Only the second column of the prediction matrix is compared.
    fresh_values = old_learner.predict(test_examples)[:, 1]

    # Each line of the stored file holds one expected value.
    with open(expected_path) as expected_file:
        for expected_line, fresh_value in zip(expected_file, fresh_values):
            assert_almost_equal(float(expected_line.strip()), fresh_value)
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    '''
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary from metrics names to values

    :raises ValueError: if the set of IDs in the examples file differs from
                        the set of IDs in the predictions file.
    '''
    # read gold standard labels
    data = load_examples(examples_file)
    gold = dict(zip(data.ids, data.classes))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        # skip header; the default keeps an empty file from leaking a bare
        # StopIteration (it is reported as an ID mismatch below instead)
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for blank lines (e.g., a
            # trailing newline at the end of the file); ignore those
            if not row:
                continue
            pred[row[0]] = float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold) != set(pred):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold)

    # the aligned label and prediction lists do not depend on the metric,
    # so build them once rather than once per metric
    gold_values = [gold[ex_id] for ex_id in example_ids]
    pred_values = [pred[ex_id] for ex_id in example_ids]

    res = {}
    for metric_name in metric_names:
        res[metric_name] = use_score_func(metric_name, gold_values,
                                          pred_values)
    return res
def _load_featureset(dir_path, feat_files, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     feature_hasher=False, num_features=None):
    '''
    Read every feature file of a featureset and combine the results.

    The full file names are built as ``dir_path/<prefix><suffix>`` and are
    loaded in sorted order; each loaded set is added onto an initially
    empty ``FeatureSet`` with ``+=``.

    :param dir_path: Directory that holds the feature files.
    :type dir_path: str
    :param feat_files: Prefixes of the feature files to load.
    :type feat_files: list of str
    :param suffix: Text appended to each prefix to form the file name.
    :type suffix: str
    :param label_col: Passed through to ``load_examples``: name of the
                      column holding the class labels.
    :type label_col: str
    :param ids_to_floats: Passed through to ``load_examples``: whether IDs
                          should be converted to floats.
    :type ids_to_floats: bool
    :param quiet: Passed through to ``load_examples``: suppress the
                  "Loading..." status message.
    :type quiet: bool
    :param class_map: Passed through to ``load_examples``: mapping from
                      original class labels to new ones.
    :type class_map: dict from str to str
    :param feature_hasher: Passed through unchanged to ``load_examples``.
    :param num_features: Passed through unchanged to ``load_examples``.

    :returns: The combination of all the loaded feature files.
    :rtype: FeatureSet
    '''
    combined = FeatureSet('')

    full_paths = [os.path.join(dir_path, prefix + suffix)
                  for prefix in feat_files]
    full_paths.sort()

    # The same loading options apply to every file in the featureset.
    load_options = dict(label_col=label_col,
                        ids_to_floats=ids_to_floats,
                        quiet=quiet,
                        class_map=class_map,
                        feature_hasher=feature_hasher,
                        num_features=num_features)

    for full_path in full_paths:
        combined += load_examples(full_path, **load_options)
    return combined
def _load_featureset(dirpath, featureset, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     unlabelled=False):
    '''
    Load a list of feature files and merge them.

    All files must contain the same example IDs in the same order, IDs must
    be unique within each file, no feature name may appear in more than one
    file, and files that carry labels must agree on them.

    :param dirpath: Path to the directory that contains the feature files.
    :type dirpath: str
    :param featureset: List of feature file prefixes
    :type featureset: list of str
    :param suffix: Suffix to add to feature file prefixes to get full
                   filenames.
    :type suffix: str
    :param label_col: Name of the column which contains the class labels.
                      If no column with that name exists, or `None` is
                      specified, the data is considered to be unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise
                          error if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param class_map: Mapping from original class labels to new ones. This
                      is mainly used for collapsing multiple classes into a
                      single class. Anything not in the mapping will be kept
                      the same.
    :type class_map: dict from str to str
    :param unlabelled: Is this test data we're loading? If so, don't raise
                       an error if there are no labels.
    :type unlabelled: bool

    :returns: The IDs, classes, features, and feature vectorizer
              representing the given featureset.
    :rtype: ExamplesTuple

    :raises ValueError: if no feature files are given, if IDs repeat within
                        a file, if the files disagree on the IDs, their
                        order or the labels, if two files share a feature
                        name, or if no file has labels while ``unlabelled``
                        is false.
    '''
    file_names = sorted(os.path.join(dirpath, featfile + suffix)
                        for featfile in featureset)

    # Without this guard an empty featureset falls through to the ID-set
    # comparison below and is reported as a confusing "0 feature files"
    # mismatch.
    if not file_names:
        raise ValueError('No feature files were given to load.')

    # Load one set of examples per feature file.
    example_tuples = [load_examples(file_name, label_col=label_col,
                                    ids_to_floats=ids_to_floats, quiet=quiet,
                                    class_map=class_map)
                      for file_name in file_names]

    # Check that the IDs are unique within each file.
    for file_name, examples in zip(file_names, example_tuples):
        ex_ids = examples.ids
        if len(ex_ids) != len(set(ex_ids)):
            raise ValueError(('The example IDs are not unique in ' +
                              '{}.').format(file_name))

    # Check that the different feature files have the same IDs.
    # To do this, make a sorted tuple of unique IDs for each feature file,
    # and then make sure they are all the same by making sure the set has
    # one item in it.
    mismatch_num = len({tuple(sorted(examples.ids))
                        for examples in example_tuples})
    if mismatch_num != 1:
        raise ValueError(('The sets of example IDs in {} feature files do ' +
                          'not match').format(mismatch_num))

    # Make sure there is a unique label for every example (or no label, for
    # "unseen" examples).
    # To do this, find the unique (id, y) tuples across all files that have
    # at least one label, and then make sure that all those ids are unique.
    unique_tuples = set(chain.from_iterable(
        zip(examples.ids, examples.classes)
        for examples in example_tuples
        if any(x is not None for x in examples.classes)))
    if len({tup[0] for tup in unique_tuples}) != len(unique_tuples):
        raise ValueError('At least two feature files have different labels ' +
                         '(i.e., y values) for the same ID.')

    # Now, create the final ExamplesTuple of examples with merged features
    merged_vectorizer = None
    merged_features = None
    merged_ids = None
    merged_classes = None
    for ids, classes, features, feat_vectorizer in example_tuples:
        # Combine feature matrices and vectorizers
        if merged_features is not None:
            # Check for duplicate feature names
            if (set(merged_vectorizer.get_feature_names()) &
                    set(feat_vectorizer.get_feature_names())):
                raise ValueError('Two feature files have the same feature!')

            # Columns of this file are placed after the ones merged so far,
            # so its vocabulary indices are shifted by that many columns.
            num_merged = merged_features.shape[1]
            merged_features = sp.hstack([merged_features, features], 'csr')

            # Walk this file's vocabulary in column order so the feature
            # names are appended in the order of the stacked columns.
            for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
                                           key=lambda x: x[1]):
                merged_vectorizer.vocabulary_[feat_name] = index + num_merged
                merged_vectorizer.feature_names_.append(feat_name)
        else:
            # The first file's vectorizer is reused (and later extended in
            # place) as the merged vectorizer.
            merged_features = features
            merged_vectorizer = feat_vectorizer

        # IDs should be the same for each ExamplesTuple, so only store once
        if merged_ids is None:
            merged_ids = ids
        # Check that IDs are in the same order
        elif not np.all(merged_ids == ids):
            raise ValueError('IDs are not in the same order in each feature ' +
                             'file!')

        # If current ExamplesTuple has labels, check that they don't conflict
        if any(x is not None for x in classes):
            # Classes should be the same for each ExamplesTuple, so store once
            if merged_classes is None:
                merged_classes = classes
            # Check that classes don't conflict, when specified
            elif not np.all(merged_classes == classes):
                raise ValueError('Feature files have conflicting labels for ' +
                                 'examples with the same ID!')

    # Ensure that at least one file had classes if we're expecting them
    if merged_classes is None and not unlabelled:
        raise ValueError('No feature files in feature set contain class ' +
                         'labels!')

    return ExamplesTuple(merged_ids, merged_classes, merged_features,
                         merged_vectorizer)
def _load_featureset(dirpath, featureset, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     unlabelled=False):
    '''
    Load a list of feature files and merge them.

    All files must contain the same example IDs in the same order, IDs must
    be unique within each file, no feature name may appear in more than one
    file, and files that carry labels must agree on them.

    :param dirpath: Path to the directory that contains the feature files.
    :type dirpath: str
    :param featureset: List of feature file prefixes
    :type featureset: list of str
    :param suffix: Suffix to add to feature file prefixes to get full
                   filenames.
    :type suffix: str
    :param label_col: Name of the column which contains the class labels.
                      If no column with that name exists, or `None` is
                      specified, the data is considered to be unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise
                          error if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param class_map: Mapping from original class labels to new ones. This
                      is mainly used for collapsing multiple classes into a
                      single class. Anything not in the mapping will be kept
                      the same.
    :type class_map: dict from str to str
    :param unlabelled: Is this test data we're loading? If so, don't raise
                       an error if there are no labels.
    :type unlabelled: bool

    :returns: The IDs, classes, features, and feature vectorizer
              representing the given featureset.
    :rtype: ExamplesTuple

    :raises ValueError: if no feature files are given, if IDs repeat within
                        a file, if the files disagree on the IDs, their
                        order or the labels, if two files share a feature
                        name, or if no file has labels while ``unlabelled``
                        is false.
    '''
    file_names = sorted(os.path.join(dirpath, featfile + suffix)
                        for featfile in featureset)

    # Without this guard an empty featureset falls through to the ID-set
    # comparison below and is reported as a confusing "0 feature files"
    # mismatch.
    if not file_names:
        raise ValueError('No feature files were given to load.')

    # Load one set of examples per feature file.
    example_tuples = [load_examples(file_name, label_col=label_col,
                                    ids_to_floats=ids_to_floats, quiet=quiet,
                                    class_map=class_map)
                      for file_name in file_names]

    # Check that the IDs are unique within each file.
    for file_name, examples in zip(file_names, example_tuples):
        ex_ids = examples.ids
        if len(ex_ids) != len(set(ex_ids)):
            raise ValueError(('The example IDs are not unique in ' +
                              '{}.').format(file_name))

    # Check that the different feature files have the same IDs.
    # To do this, make a sorted tuple of unique IDs for each feature file,
    # and then make sure they are all the same by making sure the set has
    # one item in it.
    mismatch_num = len({tuple(sorted(examples.ids))
                        for examples in example_tuples})
    if mismatch_num != 1:
        raise ValueError(('The sets of example IDs in {} feature files do ' +
                          'not match').format(mismatch_num))

    # Make sure there is a unique label for every example (or no label, for
    # "unseen" examples).
    # To do this, find the unique (id, y) tuples across all files that have
    # at least one label, and then make sure that all those ids are unique.
    unique_tuples = set(chain.from_iterable(
        zip(examples.ids, examples.classes)
        for examples in example_tuples
        if any(x is not None for x in examples.classes)))
    if len({tup[0] for tup in unique_tuples}) != len(unique_tuples):
        raise ValueError('At least two feature files have different labels ' +
                         '(i.e., y values) for the same ID.')

    # Now, create the final ExamplesTuple of examples with merged features
    merged_vectorizer = None
    merged_features = None
    merged_ids = None
    merged_classes = None
    for ids, classes, features, feat_vectorizer in example_tuples:
        # Combine feature matrices and vectorizers
        if merged_features is not None:
            # Check for duplicate feature names
            if (set(merged_vectorizer.get_feature_names()) &
                    set(feat_vectorizer.get_feature_names())):
                raise ValueError('Two feature files have the same feature!')

            # Columns of this file are placed after the ones merged so far,
            # so its vocabulary indices are shifted by that many columns.
            num_merged = merged_features.shape[1]
            merged_features = sp.hstack([merged_features, features], 'csr')

            # Walk this file's vocabulary in column order so the feature
            # names are appended in the order of the stacked columns.
            for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
                                           key=lambda x: x[1]):
                merged_vectorizer.vocabulary_[feat_name] = index + num_merged
                merged_vectorizer.feature_names_.append(feat_name)
        else:
            # The first file's vectorizer is reused (and later extended in
            # place) as the merged vectorizer.
            merged_features = features
            merged_vectorizer = feat_vectorizer

        # IDs should be the same for each ExamplesTuple, so only store once
        if merged_ids is None:
            merged_ids = ids
        # Check that IDs are in the same order
        elif not np.all(merged_ids == ids):
            raise ValueError('IDs are not in the same order in each feature ' +
                             'file!')

        # If current ExamplesTuple has labels, check that they don't conflict
        if any(x is not None for x in classes):
            # Classes should be the same for each ExamplesTuple, so store once
            if merged_classes is None:
                merged_classes = classes
            # Check that classes don't conflict, when specified
            elif not np.all(merged_classes == classes):
                raise ValueError('Feature files have conflicting labels for ' +
                                 'examples with the same ID!')

    # Ensure that at least one file had classes if we're expecting them
    if merged_classes is None and not unlabelled:
        raise ValueError('No feature files in feature set contain class ' +
                         'labels!')

    return ExamplesTuple(merged_ids, merged_classes, merged_features,
                         merged_vectorizer)