def test_custom_metric_api_experiment():
    """Test API with custom metrics"""
    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
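# The two metric files registered above are not shown in this section.
# The definitions below are a plausible sketch inferred from the metric
# names (F-beta with beta=0.75, macro-averaged, and F-beta with
# beta=0.6, micro-averaged); treat them as assumptions, not the actual
# file contents.
from sklearn.metrics import fbeta_score


def f075_macro(y_true, y_pred):
    # macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average="macro")


def f06_micro(y_true, y_pred):
    # micro-averaged F-beta score with beta = 0.6
    return fbeta_score(y_true, y_pred, beta=0.6, average="micro")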
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""
    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
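# The definition of fake_prob_metric is not shown here. A plausible
# sketch follows; the `needs_proba` keyword default mirrors sklearn's
# make_scorer convention and is an assumption about how the metric is
# flagged as probability-based, as is the use of average_precision_score.
from sklearn.metrics import average_precision_score


def fake_prob_metric(y_true, y_pred, needs_proba=True):
    # y_pred is assumed to hold positive-class probabilities
    # rather than hard label predictions
    return average_precision_score(y_true, y_pred)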
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""
    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded;
    # this should work as there should be no conflict between
    # the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
def create_fake_skll_learner(df_coefficients):
    """
    Create fake SKLL linear regression learner object using the
    coefficients in the given data frame.

    Parameters
    ----------
    df_coefficients : pandas DataFrame
        Data frame containing the linear coefficients we want to create
        the fake SKLL model with.

    Returns
    -------
    learner: skll Learner object
        SKLL LinearRegression Learner object containing
        the specified coefficients.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        # exclude NA coefficients; note that `coefficient == np.nan`
        # is always False, so we must use `np.isnan()` here
        elif np.isnan(coefficient):
            logger.warning("No coefficient was estimated for "
                           "{}. This is likely due to exact "
                           "collinearity in the model. This "
                           "feature will not be used for model "
                           "building".format(feature))
        else:
            coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake',
                         ids=['1'],
                         labels=[1.0],
                         features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now overwrite the trained model's parameters with the given coefficients
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept

    return learner
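# A minimal usage sketch. The two-column layout ('feature',
# 'coefficient') and the 'Intercept' row name follow from how the
# function parses the frame; the coefficient values themselves are
# hypothetical.
import pandas as pd

df_coefficients = pd.DataFrame(
    [('Intercept', 1.5),
     ('GRAMMAR', 0.3),
     ('VOCABULARY', 0.7)],
    columns=['feature', 'coefficient'])

fake_learner = create_fake_skll_learner(df_coefficients)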
def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept['_intercept_'], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept['_intercept_']]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                     given model.",
        conflict_handler="resolve",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("model_file", help="model file to load")
    parser.add_argument("--k",
                        help="number of top features to print (0 for all)",
                        type=int,
                        default=50)
    parser.add_argument(
        "--sign",
        choices=["positive", "negative", "all"],
        default="all",
        help="show only positive, only negative, " + "or all weights",
    )
    parser.add_argument("--version", action="version",
                        version="%(prog)s {0}".format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " +
                                "%(message)s"))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == "positive":
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == "negative":
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if "_intercept_" in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept["_intercept_"], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept["_intercept_"]]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept["_intercept_"]))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
def setUpClass(cls):
    # create a dummy train and test feature set
    X, y = make_classification(n_samples=525, n_features=10,
                               n_classes=5, n_informative=8,
                               random_state=123)
    X_train, y_train = X[:500], y[:500]
    X_test = X[500:]

    train_ids = list(range(1, len(X_train) + 1))
    train_features = [
        dict(zip(['FEATURE_{}'.format(i + 1) for i in range(X_train.shape[1])], x))
        for x in X_train
    ]
    train_labels = list(y_train)

    test_ids = list(range(1, len(X_test) + 1))
    test_features = [
        dict(zip(['FEATURE_{}'.format(i + 1) for i in range(X_test.shape[1])], x))
        for x in X_test
    ]

    cls.train_fs = FeatureSet('train',
                              ids=train_ids,
                              features=train_features,
                              labels=train_labels)
    cls.test_fs = FeatureSet('test', ids=test_ids, features=test_features)

    # train some test SKLL learners that we will use in our tests
    cls.linearsvc = Learner('LinearSVC')
    _ = cls.linearsvc.train(cls.train_fs, grid_search=False)

    cls.svc = Learner('SVC')
    _ = cls.svc.train(cls.train_fs, grid_search=False)

    cls.svc_with_probs = Learner('SVC', probability=True)
    _ = cls.svc_with_probs.train(cls.train_fs, grid_search=False)
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
def train_skll_model(model_name, df_train, experiment_id, csvdir, figdir):
    # instantiate the given SKLL learner
    learner = Learner(model_name)

    # get the features, IDs, and labels from the given data frame
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
    features = df_train[feature_columns].to_dict(orient='records')
    ids = df_train['spkitemid'].tolist()
    labels = df_train['sc1'].tolist()

    # create a FeatureSet and train the model
    fs = FeatureSet('train', ids=ids, labels=labels, features=features)

    # if it's a regression model, then our grid objective should be
    # pearson; otherwise it should be micro-averaged F1
    if model_name in ["AdaBoostRegressor", "DecisionTreeRegressor", "ElasticNet",
                      "GradientBoostingRegressor", "KNeighborsRegressor", "Lasso",
                      "LinearRegression", "RandomForestRegressor", "Ridge",
                      "SGDRegressor", "LinearSVR", "SVR"]:
        objective = 'pearson'
    else:
        objective = 'f1_score_micro'

    learner.train(fs, grid_search=True, grid_objective=objective, grid_jobs=1)

    # TODO: compute betas for linear SKLL models?

    # save the SKLL model to disk with the given model name prefix
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    # return the SKLL learner object
    return learner
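# A minimal usage sketch. The column names 'spkitemid' and 'sc1' come
# from the function itself; the training data, output paths, and
# experiment ID below are hypothetical.
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)

# hypothetical training data: 100 responses with two features
df_train = pd.DataFrame({'spkitemid': ['resp_{}'.format(i) for i in range(100)],
                         'FLUENCY': rng.rand(100),
                         'GRAMMAR': rng.rand(100)})
df_train['sc1'] = 2 * df_train['FLUENCY'] + df_train['GRAMMAR'] + rng.rand(100)

# trains an SVR (the 'pearson' grid objective is picked automatically
# since SVR is in the regressor list) and saves
# '/tmp/output/my_experiment.model'
learner = train_skll_model('SVR', df_train, 'my_experiment',
                           '/tmp/output', '/tmp/figure')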
def create_fake_skll_learner(df_coefficients):
    """
    Create fake SKLL linear regression learner object using the
    coefficients in the given data frame.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        # exclude NA coefficients; note that `coefficient == np.nan`
        # is always False, so we must use `np.isnan()` here
        elif np.isnan(coefficient):
            logger.warning("No coefficient was estimated for "
                           "{}. This is likely due to exact "
                           "collinearity in the model. This "
                           "feature will not be used for model "
                           "building".format(feature))
        else:
            coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake',
                         ids=['1'],
                         labels=[1.0],
                         features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now overwrite the trained model's parameters with the given coefficients
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept

    return learner
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
def __init__(self, model_path, threshold=None, positive_class=1):
    '''
    Initialize the predictor.

    :param model_path: Path to use when loading trained model.
    :type model_path: str
    :param threshold: If the model we're using is generating probabilities
                      of the positive class, return 1 if it meets/exceeds
                      the given threshold and 0 otherwise.
    :type threshold: float
    :param positive_class: If the model is only being used to predict the
                           probability of a particular class, this
                           specifies the index of the class we're
                           predicting. 1 = second class, which is default
                           for binary classification.
    :type positive_class: int
    '''
    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_class
    self.threshold = threshold
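# A brief usage sketch, assuming this __init__ belongs to a Predictor
# class (the enclosing class is not shown in this section) and using a
# hypothetical model path: load a trained binary classifier and
# threshold its positive-class probabilities at 0.7.
predictor = Predictor('/path/to/trained.model',
                      threshold=0.7,
                      positive_class=1)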
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""
    # register a lower-is-better custom metric from our file,
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now set up another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs,
                                          grid_objective="precision")

    # for both learners the ranking of the C hyperparameter should be
    # identical since when we defined one_minus_precision we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship
    # except that our custom metric should be negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
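# A plausible definition of one_minus_precision for custom_metrics.py.
# The test comments confirm that `greater_is_better` was set to False
# when the metric was defined; exposing it as a defaulted keyword
# argument (mirroring sklearn's make_scorer option of the same name)
# is an assumption about the exact mechanism.
from sklearn.metrics import precision_score


def one_minus_precision(y_true, y_pred, greater_is_better=False):
    # lower is better: perfect precision of 1.0 yields 0.0
    return 1 - precision_score(y_true, y_pred)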
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int, default=50)
    group.add_argument("--sort_by_labels", '-s',
                       action='store_true',
                       default=False,
                       help="order the features by classes")
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
            (isinstance(model, LogisticRegression) and
             len(learner.label_list) > 2) or
            (isinstance(model, SVC) and model.kernel == 'linear')):
        multiclass = True

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of
            # intercepts but sometimes that array is of length 1 so we
            # don't need to print that as an array/list. First, let's
            # normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercepts; arrays of length > 1 cannot
            # be formatted with `{:.12f}`, so print them element-wise
            if intercept_is_array:
                intercept_list = ["%.12f" % i for i in model_intercepts]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(),
                                    key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    # if this is the 'expm' directory, then go
    # up one level.
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'],
                                      configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not '
                                'exist'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not '
                                'exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool '
                                    'experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any '
                                'rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the '
                                'experiment "{}". The following experiments '
                                'are contained in this directory: '
                                '{}'.format(experiment_output_dir,
                                            experiment_id,
                                            experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model '
                                    'training'.format(experiment_output_dir,
                                                      expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as strings
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default;
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file,
    # we append it to the output anyway as long as
    # it is in the input file
    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique and "
                         "re-run the tool.".format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: '
                       '{}'.format(feats_file, missing_features))

    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logging.warning('The following extraneous features will be '
                        'ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to the preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. "
                         "No analysis will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] -
                                                  train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] *
                                                  feature_weight)

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)

        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir,
                                   '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir,
                                          '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy, if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
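# The rescaling step above is a plain z-score transform into the human
# score distribution. A standalone illustration with made-up parameter
# values (all numbers below are hypothetical; np.clip stands in for
# the rsmtool `trim` helper):
import numpy as np

train_predictions_mean, train_predictions_sd = 3.1, 0.8
h1_mean, h1_sd = 3.0, 1.1
trim_min, trim_max = 1, 4

raw = np.array([2.2, 3.6, 5.0])

# standardize against the training predictions, then map onto the
# human score distribution
scaled = (raw - train_predictions_mean) / train_predictions_sd * h1_sd + h1_mean

# clamp into the trimming range and round to integer scores
trimmed = np.clip(scaled, trim_min, trim_max)
rounded = np.rint(trimmed).astype('int64')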
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':

        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(
            lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns
                             if c not in ['sc1', 'sumfeature', 'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] +
                               [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = (df_weights['coefficient']
                                   .multiply(df_train[feature_columns].std(),
                                             axis='index') /
                                   df_train['sc1'].std())

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(
            lambda row: row['coefficient'] * (1 - RT) if row['coefficient'] > 0 else delta,
            axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = ((df_betas['coefficient'] * df_train['sc1'].std())
                   .divide(df_train[feature_columns].std(), axis='index'))

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)),
                                 ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)),
                   index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = (df_betas.multiply(df_train[used_features].std(), axis='index') /
                df_train['sc1'].std())
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
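# On the lambda-to-alpha normalization used in the Lasso variants
# above: sklearn's Lasso minimizes
#   (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
# so a penalty expressed as a raw lambda on the un-averaged squared
# error corresponds to alpha = lambda / n_samples. A standalone sketch
# of the conversion (the helper name and example numbers are
# hypothetical):
from math import log10, sqrt


def lambda_to_sklearn_alpha(n_train, n_features):
    # the preset penalty used above: sqrt(n * log10(p))
    p_lambda = sqrt(n_train * log10(n_features))
    # divide by the number of training rows to match sklearn's
    # per-sample-averaged objective
    return p_lambda / n_train


# e.g., 500 training responses and 10 features
alpha = lambda_to_sklearn_alpha(500, 10)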
def validate_config(cls, config, context='rsmtool'):
    """
    Validate the given configuration.

    Ensure that all required fields are specified, add default
    values for all unspecified fields, and ensure that all
    specified fields are valid.

    Parameters
    ----------
    config : dict
        The configuration dictionary to validate.
    context : str, optional
        Context of the tool in which we are validating.
        Possible values are ::

            {'rsmtool', 'rsmeval', 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

        Defaults to 'rsmtool'.

    Returns
    -------
    new_config : dict
        The validated configuration dictionary.

    Raises
    ------
    ValueError
        If any of the validation checks fail.
    """
    # make a copy of the given parameter dictionary
    new_config = deepcopy(config)

    # 1. Check to make sure all required fields are specified
    required_fields = CHECK_FIELDS[context]['required']
    for field in required_fields:
        if field not in new_config:
            raise ValueError("The config file must "
                             "specify '{}'".format(field))

    # 2. Add default values for unspecified optional fields
    # for given RSMTool context
    defaults = DEFAULTS
    for field in defaults:
        if field not in new_config:
            new_config[field] = defaults[field]

    # 3. Check to make sure no unrecognized fields are specified
    for field in new_config:
        if field not in defaults and field not in required_fields:
            raise ValueError("Unrecognized field '{}'"
                             " in json file".format(field))

    # 4. Check to make sure that the ID fields that will be
    # used as part of filenames are formatted correctly,
    # i.e., they do not contain any spaces and are <= 200 characters
    id_field = ID_FIELDS[context]
    id_field_values = {id_field: new_config[id_field]}
    for id_field, id_field_value in id_field_values.items():
        if len(id_field_value) > 200:
            raise ValueError("{} is too long (must be "
                             "<=200 characters)".format(id_field))
        if re.search(r'\s', id_field_value):
            raise ValueError("{} cannot contain any "
                             "spaces".format(id_field))

    # 5. Check that the feature file and feature subset/subset file are not
    # specified together
    msg = ("You cannot specify BOTH \"features\" and \"{}\". "
           "Please refer to the \"Selecting Feature Columns\" "
           "section in the documentation for more details.")
    if new_config['features'] and new_config['feature_subset_file']:
        msg = msg.format("feature_subset_file")
        raise ValueError(msg)
    if new_config['features'] and new_config['feature_subset']:
        msg = msg.format("feature_subset")
        raise ValueError(msg)

    # 6. Check for fields that require feature_subset_file and try
    # to use the default feature file
    if (new_config['feature_subset'] and
            not new_config['feature_subset_file']):

        # Check if we have the default subset file from rsmextra
        if HAS_RSMEXTRA:
            default_basename = Path(default_feature_subset_file).name
            new_config['feature_subset_file'] = default_feature_subset_file
            logging.warning("You requested feature subsets but did not "
                            "specify any feature file. "
                            "The tool will use the default "
                            "feature file {} available via "
                            "rsmextra".format(default_basename))
        else:
            raise ValueError("If you want to use feature subsets, you "
                             "must specify a feature subset file")

    if new_config['sign'] and not new_config['feature_subset_file']:

        # Check if we have the default subset file from rsmextra
        if HAS_RSMEXTRA:
            default_basename = Path(default_feature_subset_file).name
            new_config['feature_subset_file'] = default_feature_subset_file
            logging.warning("You specified the expected sign of "
                            "correlation but did not specify a feature "
                            "subset file. The tool will use "
                            "the default feature subset file {} "
                            "available via "
                            "rsmextra".format(default_basename))
        else:
            raise ValueError("If you want to specify the expected sign of "
                             "correlation for each feature, you must "
                             "specify a feature subset file")

    # Use the default sign if we are using the default feature file
    # and sign has not been specified in the config file
    if HAS_RSMEXTRA:
        default_feature = default_feature_subset_file
        if (new_config['feature_subset_file'] == default_feature and
                not new_config['sign']):
            new_config['sign'] = default_feature_sign

    # 7. Check for fields that must be specified together
    if (new_config['min_items_per_candidate'] and
            not new_config['candidate_column']):
        raise ValueError("If you want to filter out candidates with "
                         "responses to less than X items, you need "
                         "to specify the name of the column which "
                         "contains candidate IDs.")

    # 8. Check that if "skll_objective" is specified, it's
    # one of the metrics that SKLL allows for AND that it is
    # specified for a SKLL model and _not_ a built-in
    # linear regression model
    if new_config['skll_objective']:
        if not is_skll_model(new_config['model']):
            warnings.warn("You specified a custom SKLL objective but also "
                          "chose a non-SKLL model. The objective will be "
                          "ignored.")
        else:
            if new_config['skll_objective'] not in SCORERS:
                raise ValueError("Invalid SKLL objective. Please refer to "
                                 "the SKLL documentation and choose a valid "
                                 "tuning objective.")

    # 9. Check that if "skll_fixed_parameters" is specified,
    # it's specified for a SKLL model and _not_ a built-in linear
    # regression model; we cannot check whether the parameters
    # are valid at parse time but SKLL will raise an error
    # at run time for any invalid parameters
    if new_config['skll_fixed_parameters']:
        if not is_skll_model(new_config['model']):
            warnings.warn("You specified custom SKLL fixed parameters but "
                          "also chose a non-SKLL model. The parameters will "
                          "be ignored.")

    # 10. Check that if we are running rsmtool to ask for
    # expected scores then the SKLL model type must actually
    # support probabilistic classification. If it's not a SKLL
    # model at all, we just treat it as a LinearRegression model
    # which is basically what they all are in the end.
    if context == 'rsmtool' and new_config['predict_expected_scores']:
        model_name = new_config['model']
        dummy_learner = (Learner(model_name) if is_skll_model(model_name)
                         else Learner('LinearRegression'))
        if not hasattr(dummy_learner.model_type, 'predict_proba'):
            raise ValueError("{} does not support expected scores "
                             "since it is not a probabilistic "
                             "classifier.".format(model_name))
        del dummy_learner

    # 11. Check the fields that require rsmextra
    if not HAS_RSMEXTRA:
        if new_config['special_sections']:
            raise ValueError("Special sections are only available to ETS"
                             " users by installing the rsmextra package.")

    # 12. Raise a warning if we are specifying a feature file but also
    # telling the system to automatically select transformations
    if new_config['features'] and new_config['select_transformations']:
        # Show a warning unless the user passed a list of features.
        if not isinstance(new_config['features'], list):
            warnings.warn("You specified a feature file but also set "
                          "`select_transformations` to True. Any "
                          "transformations or signs specified in "
                          "the feature file will be overwritten by "
                          "the automatically selected transformations "
                          "and signs.")

    # 13. If we have `experiment_names`, check that the length of the list
    # matches the list of experiment_dirs.
    if context == 'rsmsummarize' and new_config['experiment_names']:
        if len(new_config['experiment_names']) != len(new_config['experiment_dirs']):
            raise ValueError("The number of specified experiment names "
                             "should be the same as the number of "
                             "specified experiment directories.")

    # 14. Check that if the user specified min_n_per_group, they also
    # specified subgroups. If they supplied a dictionary, make
    # sure the keys match
    if new_config['min_n_per_group']:
        # make sure we have subgroups
        if 'subgroups' not in new_config:
            raise ValueError("You must specify a list of subgroups in "
                             "the `subgroups` field if you want to use "
                             "the `min_n_per_group` field")
        # if we got a dictionary, make sure the keys match
        elif isinstance(new_config['min_n_per_group'], dict):
            if sorted(new_config['min_n_per_group'].keys()) != sorted(new_config['subgroups']):
                raise ValueError("The keys in `min_n_per_group` must "
                                 "match the subgroups in the `subgroups` field")
        # else convert to a dictionary
        else:
            new_config['min_n_per_group'] = {group: new_config['min_n_per_group']
                                             for group in new_config['subgroups']}

    # 15. Clean up the config dict to keep only context-specific fields
    context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                               CHECK_FIELDS[context]['required'])
    new_config = {k: v for k, v in new_config.items()
                  if k in context_relevant_fields}

    return new_config
def validate_config(self, context='rsmtool', inplace=True):
    """
    Ensure that all required fields are specified, add default values
    for all unspecified fields, and ensure that all specified fields
    are valid.

    Parameters
    ----------
    context : str, optional
        Context of the tool in which we are validating.
        Possible values are ::

            {'rsmtool', 'rsmeval', 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

        Defaults to 'rsmtool'.
    inplace : bool
        Maintain the state of the config object produced by this method.
        Defaults to True.

    Returns
    -------
    config_obj : Configuration
        A configuration object

    Raises
    ------
    ValueError
        If config does not exist, and no config passed.
    """
    # Check to make sure a configuration file
    # or dictionary has been loaded.
    self._check_config_is_loaded()

    # Get the parameter dictionary
    new_config = self._config

    # 1. Check to make sure all required fields are specified
    required_fields = CHECK_FIELDS[context]['required']
    for field in required_fields:
        if field not in new_config:
            raise ValueError("The config file must "
                             "specify '{}'".format(field))

    # 2. Add default values for unspecified optional fields
    # for the given RSMTool context
    defaults = DEFAULTS
    for field in defaults:
        if field not in new_config:
            new_config[field] = defaults[field]

    # 3. Check to make sure no unrecognized fields are specified
    for field in new_config:
        if field not in defaults and field not in required_fields:
            raise ValueError("Unrecognized field '{}'"
                             " in json file".format(field))

    # 4. Check to make sure that the ID fields that will be
    # used as part of filenames are formatted correctly
    id_fields = ['comparison_id', 'experiment_id', 'summary_id']
    id_field_values = {field: new_config[field] for field in new_config
                       if field in id_fields}

    # we do not need to validate any IDs for `rsmpredict`
    self.check_id_fields(id_field_values)

    # 5. Check that the feature file and feature subset/subset file
    # are not specified together
    msg = ("You cannot specify BOTH \"features\" and \"{}\". "
           "Please refer to the \"Selecting Feature Columns\" "
           "section in the documentation for more details.")
    if new_config['features'] and new_config['feature_subset_file']:
        msg = msg.format("feature_subset_file")
        raise ValueError(msg)
    if new_config['features'] and new_config['feature_subset']:
        msg = msg.format("feature_subset")
        raise ValueError(msg)

    # 6. Check for fields that require feature_subset_file and try
    # to use the default feature file
    if (new_config['feature_subset'] and
            not new_config['feature_subset_file']):

        # Check if we have the default subset file from rsmextra
        if HAS_RSMEXTRA:
            default_basename = basename(default_feature_subset_file)
            new_config['feature_subset_file'] = default_feature_subset_file
            logging.warning("You requested feature subsets but did not "
                            "specify any feature file. "
                            "The tool will use the default "
                            "feature file {} available via "
                            "rsmextra".format(default_basename))
        else:
            raise ValueError("If you want to use feature subsets, you "
                             "must specify a feature subset file")

    if new_config['sign'] and not new_config['feature_subset_file']:

        # Check if we have the default subset file from rsmextra
        if HAS_RSMEXTRA:
            default_basename = basename(default_feature_subset_file)
            new_config['feature_subset_file'] = default_feature_subset_file
            logging.warning("You specified the expected sign of "
                            "correlation but did not specify a feature "
                            "subset file. The tool will use "
                            "the default feature subset file {} "
                            "available via "
                            "rsmextra".format(default_basename))
        else:
            raise ValueError("If you want to specify the expected sign of "
                             "correlation for each feature, you must "
                             "specify a feature subset file")

    # Use the default sign if we are using the default feature file
    # and sign has not been specified in the config file
    if HAS_RSMEXTRA:
        default_feature = default_feature_subset_file
        if (new_config['feature_subset_file'] == default_feature and
                not new_config['sign']):
            new_config['sign'] = default_feature_sign

    # 7. Check for fields that must be specified together
    if (new_config['min_items_per_candidate'] and
            not new_config['candidate_column']):
        raise ValueError("If you want to filter out candidates with "
                         "responses to less than X items, you need "
                         "to specify the name of the column which "
                         "contains candidate IDs.")

    # 8. Check that if "skll_objective" is specified, it's
    # one of the metrics that SKLL allows for AND that it is
    # specified for a SKLL model and _not_ a built-in
    # linear regression model
    if new_config['skll_objective']:
        if not is_skll_model(new_config['model']):
            logging.warning("You specified a custom SKLL objective but also "
                            "chose a non-SKLL model. The objective will be "
                            "ignored.")
        else:
            if new_config['skll_objective'] not in SCORERS:
                raise ValueError("Invalid SKLL objective. Please refer to the "
                                 "SKLL documentation and choose a valid tuning "
                                 "objective.")

    # 9. Check that if we are running rsmtool to ask for
    # expected scores then the SKLL model type must actually
    # support probabilistic classification. If it's not a SKLL
    # model at all, we just treat it as a LinearRegression model
    # which is basically what they all are in the end.
    if context == 'rsmtool' and new_config['predict_expected_scores']:
        model_name = new_config['model']
        dummy_learner = (Learner(model_name) if is_skll_model(model_name)
                         else Learner('LinearRegression'))
        if not hasattr(dummy_learner.model_type, 'predict_proba'):
            raise ValueError("{} does not support expected scores "
                             "since it is not a probabilistic "
                             "classifier.".format(model_name))
        del dummy_learner

    # 10. Check the fields that require rsmextra
    if not HAS_RSMEXTRA:
        if new_config['special_sections']:
            raise ValueError("Special sections are only available to ETS"
                             " users by installing the rsmextra package.")

    # 11. Raise a warning if we are specifying a feature file but also
    # telling the system to automatically select transformations
    if new_config['features'] and new_config['select_transformations']:
        logging.warning("You specified a feature file but also set "
                        "`select_transformations` to True. Any "
                        "transformations or signs specified in "
                        "the feature file will be overwritten by "
                        "the automatically selected transformations "
                        "and signs.")

    # 12. Clean up the config dict to keep only context-specific fields
    context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                               CHECK_FIELDS[context]['required'])
    new_config = {k: v for k, v in new_config.items()
                  if k in context_relevant_fields}

    if inplace:
        self._config = new_config
    return Configuration(self._config, self._filepath)
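# A hedged usage sketch (the `parser` name is hypothetical): assuming
# `parser` is an instance of this class whose config dictionary has
# already been loaded, validating it for the rsmeval context without
# mutating the parser's stored state might look like this:
#
#     config_obj = parser.validate_config(context='rsmeval', inplace=False)
#     # `config_obj` is a Configuration object restricted to rsmeval fields,
#     # with defaults filled in for any unspecified optional fields.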
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':

        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(
            lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns
                             if c not in ['sc1', 'sumfeature', 'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] +
                               [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = (df_weights['coefficient']
                                   .multiply(df_train[feature_columns].std(),
                                             axis='index') /
                                   df_train['sc1'].std())

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(
            lambda row: row['coefficient'] * (1 - RT) if row['coefficient'] > 0 else delta,
            axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = ((df_betas['coefficient'] * df_train['sc1'].std())
                   .divide(df_train[feature_columns].std(), axis='index'))

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)),
                                 ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)),
                   index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = (df_betas.multiply(df_train[used_features].std(), axis='index') /
                df_train['sc1'].std())
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
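# A small self-contained sketch (synthetic data; feature names and weights are
# hypothetical) of the standardized-coefficient computation used above: each
# raw coefficient b_j becomes beta_j = b_j * sd(feature_j) / sd(score), and
# the relative weights are the betas normalized by the sum of their absolute
# values.
def _demo_standardized_betas():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame({'f1': rng.rand(50), 'f2': rng.rand(50) * 4})
    df['sc1'] = 2 * df['f1'] + 0.5 * df['f2'] + rng.rand(50) * 0.1
    coefs = pd.Series({'f1': 2.0, 'f2': 0.5})   # hypothetical raw weights
    betas = coefs * df[['f1', 'f2']].std() / df['sc1'].std()
    relative = betas / betas.abs().sum()
    return betas, relative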
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    # if this is the 'expm' directory, then go
    # up one level.
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(
            config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(
            config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool '
                                    'experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any '
                                'rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the '
                                'experiment "{}". The following experiments '
                                'are contained in this directory: '
                                '{}'.format(experiment_output_dir,
                                            experiment_id,
                                            experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model '
                                    'training'.format(experiment_output_dir,
                                                      expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as a string
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default;
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file,
    # we append it to the output anyway as long as
    # it is in the input file
    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique and "
                         "re-run the tool.".format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: '
                       '{}'.format(feats_file, missing_features))

    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logging.warning('The following extraneous features will be '
                        'ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to the pre-processed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. "
                         "No analysis will be run.")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] -
                                                  train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] *
                                                  feature_weight)

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)

        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir,
                                   '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir,
                                          '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy, if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
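# A simplified, self-contained sketch of the post-processing applied above
# (the numbers are synthetic, and `np.clip` stands in for the tool's `trim`
# helper): predictions are z-scored against the training predictions,
# rescaled to the human-score distribution, then trimmed and rounded.
def _demo_postprocess_predictions():
    import numpy as np
    raw = np.array([1.2, 2.7, 3.9])
    train_mean, train_sd = 2.5, 0.8   # hypothetical training prediction stats
    h1_mean, h1_sd = 3.0, 1.1         # hypothetical human-score stats
    trim_min, trim_max = 1, 4
    scaled = (raw - train_mean) / train_sd * h1_sd + h1_mean
    trimmed = np.clip(scaled, trim_min, trim_max)
    rounded = np.rint(trimmed).astype('int64')
    return scaled, trimmed, rounded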
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    parser = argparse.ArgumentParser(
        description="Prints out the weights of a given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int, default=50)
    group.add_argument("--sort_by_labels", '-s', action='store_true',
                       default=False,
                       help="order the features by classes")
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
            (isinstance(model, LogisticRegression) and
             len(learner.label_list) > 2) or
            (isinstance(model, SVC) and model.kernel == 'linear')):
        multiclass = True
    weight_items = weights.items()
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of intercepts
            # but sometimes that array is of length 1 so we don't need to print
            # it as an array/list. First, let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercepts
            print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(),
                                    key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))
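# A hedged command-line usage sketch (the script and model file names are
# hypothetical; the flags are the ones defined in `main` above):
#
#     python print_model_weights.py my_experiment.model --k 10 --sign positive
#
# This would print the model intercept(s) followed by the 10 largest-magnitude
# positive feature weights, one per line.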