def main(unused_argv):
  """Scores every embedding/model/normalization combination with Beam.

  Asserts that `file_utils.Glob` returns a truthy result for each of the train,
  eval and test globs, makes sure the directory of `FLAGS.output_file` exists,
  builds one keyword-argument dict per experiment, and maps
  `train_and_eval_sklearn.train_and_get_score` over those dicts in a Beam
  pipeline whose formatted results go to a single-shard text output.

  For each (embedding, model) pair the configurations are emitted in this
  order: L2 on / L2 off with the configured speaker ID field, then (only when
  a speaker ID field is configured) L2 on / L2 off with no speaker ID field.

  Args:
    unused_argv: Ignored.
  """
  for data_glob in (FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob):
    assert file_utils.Glob(data_glob), data_glob

  # Create output directory if it doesn't already exist.
  file_utils.MaybeMakeDirs(os.path.dirname(FLAGS.output_file))

  # The configured speaker ID field always comes first; a "no speaker ID" pass
  # is added only when a field was actually configured.
  speaker_choices = [FLAGS.speaker_id_name]
  if FLAGS.speaker_id_name is not None:
    speaker_choices.append(None)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for embedding_name, model_name in itertools.product(
      FLAGS.embedding_list, model_names):
    for speaker_choice in speaker_choices:
      for use_l2 in (True, False):
        exp_params.append({
            'embedding_name': embedding_name,
            'model_name': model_name,
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            'l2_normalization': use_l2,
            'speaker_id_name': speaker_choice,
            'save_model_dir': FLAGS.save_model_dir,
            'save_predictions_dir': FLAGS.save_predictions_dir,
            'eval_metric': FLAGS.eval_metric,
        })

  # Make and run beam pipeline.
  pipeline_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(pipeline_options) as root:
    configs = root | 'MakeCollection' >> beam.Create(exp_params)
    # Each output element pairs the config dict with its scores.
    scored = configs | 'CalcScores' >> beam.Map(
        lambda params: (
            params, train_and_eval_sklearn.train_and_get_score(**params)))
    text_lines = scored | 'FormatText' >> beam.Map(format_text_line)
    reshuffled = text_lines | 'Reshuffle' >> beam.Reshuffle()
    _ = reshuffled | 'WriteOutput' >> beam.io.WriteToText(
        FLAGS.output_file, num_shards=1)
def main(unused_argv):
  """Scores every embedding/model combination with Beam.

  Asserts that `file_utils.Glob` returns a truthy result for each of the train,
  eval and test globs, makes sure the directory of `FLAGS.output_file` exists,
  builds one keyword-argument dict per (embedding, model) pair, and maps
  `train_and_eval_sklearn.train_and_get_score` over those dicts in a Beam
  pipeline whose formatted results go to a single-shard text output.

  Args:
    unused_argv: Ignored.
  """
  for data_glob in (FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob):
    assert file_utils.Glob(data_glob), data_glob

  # Create output directory if it doesn't already exist.
  file_utils.MaybeMakeDirs(os.path.dirname(FLAGS.output_file))

  # Enumerate the configurations we want to run.
  model_names = models.get_sklearn_models().keys()
  exp_params = [
      {
          'embedding_name': embedding_name,
          'model_name': model_name,
          'label_name': FLAGS.label_name,
          'label_list': FLAGS.label_list,
          'train_glob': FLAGS.train_glob,
          'eval_glob': FLAGS.eval_glob,
          'test_glob': FLAGS.test_glob,
          # L2 normalization is switched on exactly when no speaker ID field
          # is configured, so only one of the two normalizations is requested
          # per run.
          'l2_normalization': FLAGS.speaker_id_name is None,
          'speaker_id_name': FLAGS.speaker_id_name,
          'save_model_dir': FLAGS.save_model_dir,
          'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
      }
      for embedding_name, model_name in itertools.product(
          FLAGS.embedding_list, model_names)
  ]

  # Make and run beam pipeline.
  pipeline_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(pipeline_options) as root:
    configs = root | 'MakeCollection' >> beam.Create(exp_params)
    # Each output element pairs the config dict with its scores.
    scored = configs | 'CalcScores' >> beam.Map(
        lambda params: (
            params, train_and_eval_sklearn.train_and_get_score(**params)))
    text_lines = scored | 'FormatText' >> beam.Map(format_text_line)
    reshuffled = text_lines | 'Reshuffle' >> beam.Reshuffle()
    _ = reshuffled | 'WriteOutput' >> beam.io.WriteToText(
        FLAGS.output_file, num_shards=1)
def experiment_params(
    embedding_list,
    speaker_id_name,
    label_name,
    label_list,
    train_glob,
    eval_glob,
    test_glob,
    save_model_dir,
    save_predictions_dir,
    eval_metrics,
    comma_escape_char = '?',
):
  """Builds the list of experiment configurations to run.

  For each (embedding, model) pair the configurations are emitted in this
  order: L2 on / L2 off with `speaker_id_name`, then (only when
  `speaker_id_name` is not `None`) L2 on / L2 off with no speaker ID field.

  Args:
    embedding_list: Embedding names; passed through `_maybe_add_commas` first.
    speaker_id_name: `None`, or name of speaker ID field.
    label_name: Copied into every configuration.
    label_list: Copied into every configuration.
    train_glob: Copied into every configuration.
    eval_glob: Copied into every configuration.
    test_glob: Copied into every configuration.
    save_model_dir: Copied into every configuration.
    save_predictions_dir: Copied into every configuration.
    eval_metrics: Copied into every configuration.
    comma_escape_char: Placeholder character handed to `_maybe_add_commas`.

  Returns:
    A list of dicts, one per configuration, in the order described above.
  """
  # List-valued flags are split on commas, so an embedding name that needs a
  # literal comma is written with `comma_escape_char` instead; the helper is
  # expected to turn that placeholder back into a comma.
  embedding_list = _maybe_add_commas(embedding_list, comma_escape_char)

  # The given speaker ID field always comes first; a "no speaker ID" pass is
  # added only when a field was actually given.
  speaker_choices = [speaker_id_name]
  if speaker_id_name is not None:
    speaker_choices.append(None)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for embedding_name, model_name in itertools.product(
      embedding_list, model_names):
    for speaker_choice in speaker_choices:
      for use_l2 in (True, False):
        exp_params.append({
            'embedding_name': embedding_name,
            'model_name': model_name,
            'label_name': label_name,
            'label_list': label_list,
            'train_glob': train_glob,
            'eval_glob': eval_glob,
            'test_glob': test_glob,
            'l2_normalization': use_l2,
            'speaker_id_name': speaker_choice,
            'save_model_dir': save_model_dir,
            'save_predictions_dir': save_predictions_dir,
            'eval_metrics': eval_metrics,
        })
  return exp_params
def test_sklearn_models_sanity(self, model_name):
  """Fits one registered sklearn model and checks it clears an accuracy floor.

  Args:
    model_name: Key into `models.get_sklearn_models()`.
  """
  # Seed the random number generators for reproducibility, following:
  # https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development.
  np.random.seed(42)
  rn.seed(42)

  classifier = models.get_sklearn_models()[model_name]()

  # Actually train.
  train_x, train_y = _get_some_data(9000)
  classifier.fit(train_x, train_y)

  # Score on a freshly generated, smaller batch. Models with 'forest' in their
  # name only need to exceed 0.5; every other model must exceed 0.9.
  held_out_x, held_out_y = _get_some_data(512)
  accuracy = classifier.score(held_out_x, held_out_y)
  if 'forest' in model_name.lower():
    min_accuracy = 0.5
  else:
    min_accuracy = 0.9
  self.assertGreater(accuracy, min_accuracy)
  logging.info('%s final acc: %f', classifier, accuracy)
def main(unused_argv):
  """Scores every embedding/model combination with Beam.

  Asserts that `file_utils.Glob` returns a truthy result for each of the train,
  eval and test globs, makes sure the directory of `FLAGS.output_file` exists,
  builds one keyword-argument dict per (embedding, model) pair, and maps
  `train_and_eval_sklearn.train_and_get_score` over those dicts in a Beam
  pipeline whose formatted results go to a single-shard text output. The
  pipeline is run explicitly and waited on before returning.

  Args:
    unused_argv: Ignored.
  """
  for data_glob in (FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob):
    assert file_utils.Glob(data_glob), data_glob

  # Create output directory if it doesn't already exist.
  file_utils.MaybeMakeDirs(os.path.dirname(FLAGS.output_file))

  # Enumerate the configurations we want to run.
  model_names = models.get_sklearn_models().keys()
  exp_params = [
      {
          'embedding_name': embedding_name,
          'model_name': model_name,
          'label_name': FLAGS.label_name,
          'label_list': FLAGS.label_list,
          'train_glob': FLAGS.train_glob,
          'eval_glob': FLAGS.eval_glob,
          'test_glob': FLAGS.test_glob,
          # L2 normalization is switched on exactly when no speaker ID field
          # is configured, so only one of the two normalizations is requested
          # per run.
          'l2_normalization': FLAGS.speaker_id_name is None,
          'speaker_id_name': FLAGS.speaker_id_name,
      }
      for embedding_name, model_name in itertools.product(
          FLAGS.embedding_list, model_names)
  ]

  # Make and run beam pipeline.
  pipeline = beam.Pipeline()
  configs = pipeline | 'MakeCollection' >> beam.Create(exp_params)
  # Each output element pairs the config dict with its scores.
  scored = configs | 'CalcScores' >> beam.Map(
      lambda params: (
          params, train_and_eval_sklearn.train_and_get_score(**params)))
  text_lines = scored | 'FormatText' >> beam.Map(format_text_line)
  _ = text_lines | 'WriteOutput' >> beam.io.WriteToText(
      FLAGS.output_file, num_shards=1)

  pipeline.run().wait_until_finish()
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None):
  """Train an sklearn model on embeddings and score it on eval and test data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model; a key of `models.get_sklearn_models()`.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, write sklearn models to this directory.

  Returns:
    A tuple `(eval_score, test_score)`: the values returned by the trained
    model's `score` method on the eval and test data respectively.

  Raises:
    AssertionError: If any split has no embeddings or no labels, or if the
      labels of any split contain fewer than two distinct values.
  """

  def _read_glob(glob, name):
    """Reads one split into (embeddings, labels) arrays and logs the time."""
    start = time.time()
    npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                               label_name, label_list,
                                               l2_normalization,
                                               speaker_id_name)
    logging.info('Finished reading %s data: %.2f sec.', name,
                 time.time() - start)
    return npx, npy

  # Read and validate data.
  npx_train, npy_train = _read_glob(train_glob, 'train')
  npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
  npx_test, npy_test = _read_glob(test_glob, 'test')

  named_splits = (
      ('train', npx_train, npy_train),
      ('eval', npx_eval, npy_eval),
      ('test', npx_test, npy_test),
  )
  # Sanity check npx_*: every split must contain embeddings.
  for split, npx, _ in named_splits:
    assert npx.size > 0, f'No embeddings were read for the {split} split.'
  # Sanity check npy_*: every split needs labels from at least two classes.
  for split, _, npy in named_splits:
    assert npy.size > 0, f'No labels were read for the {split} split.'
    assert np.unique(npy).size > 1, (
        f'The {split} split contains fewer than two distinct labels.')

  # Train models.
  d = models.get_sklearn_models()[model_name]()
  logging.info('Made model.')
  start = time.time()
  d.fit(npx_train, npy_train)
  logging.info('Trained model: %.2f min', (time.time() - start) / 60.0)

  # Eval.
  eval_score = d.score(npx_eval, npy_eval)
  logging.info('%s: %.3f', model_name, eval_score)

  # Test.
  test_score = d.score(npx_test, npy_test)
  logging.info('%s: %.3f', model_name, test_score)

  # If `save_model_dir` is present, write model to this directory.
  # To load the model after saving, use:
  # ```python
  # with file_utils.Open(model_filename, 'rb') as f:
  #   m = pickle.load(f)
  # ```
  # Only unpickle files you trust: `pickle.load` can execute arbitrary code.
  # NOTE: the filename depends only on `model_name`, so calls that share
  # `save_model_dir` and `model_name` but differ in embedding or normalization
  # overwrite each other's file.
  if save_model_dir:
    file_utils.MaybeMakeDirs(save_model_dir)
    model_filename = os.path.join(save_model_dir, f'{model_name}.pickle')
    with file_utils.Open(model_filename, 'wb') as f:
      pickle.dump(d, f)

  return (eval_score, test_score)
def train_and_get_score(
    embedding_name,
    label_name,
    label_list,
    train_glob,
    eval_glob,
    test_glob,
    model_name,
    l2_normalization,
    speaker_id_name = None,
    save_model_dir = None,
    save_predictions_dir = None,
    eval_metrics = ('accuracy',)
):
  """Train (or load) an sklearn model and score it on eval and test data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model; a key of `models.get_sklearn_models()`.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, sklearn models are written to
      `<save_model_dir>/<embedding_name>/<model_name>.pickle`. If that file
      already exists, it is loaded instead of training a new model.
    save_predictions_dir: If not `None`, write numpy array of predictions on
      train, eval, and test into this directory.
    eval_metrics: Iterable of string names of the desired evaluation metrics.

  Returns:
    A dict: {metric name: (eval metric, test metric)}

  Raises:
    AssertionError: If any split has no embeddings or no labels, or if the
      labels of any split contain fewer than two distinct values.
  """

  def _read_glob(glob, name):
    """Reads one split into (embeddings, labels) arrays and logs the time."""
    logging.info('Starting to read %s: %s', name, glob)
    start = time.time()
    # The third returned value is not needed here.
    npx, npy, _ = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                  label_name, label_list,
                                                  l2_normalization,
                                                  speaker_id_name)
    logging.info('Finished reading %s %s data: %.2f sec.', embedding_name,
                 name, time.time() - start)
    return npx, npy

  # Read and validate data.
  npx_train, npy_train = _read_glob(train_glob, 'train')
  npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
  npx_test, npy_test = _read_glob(test_glob, 'test')

  named_splits = (
      ('train', npx_train, npy_train),
      ('eval', npx_eval, npy_eval),
      ('test', npx_test, npy_test),
  )
  # Sanity check npx_*: every split must contain embeddings.
  for split, npx, _ in named_splits:
    assert npx.size > 0, f'No embeddings were read for the {split} split.'
  # Sanity check npy_*: every split needs labels from at least two classes.
  for split, _, npy in named_splits:
    assert npy.size > 0, f'No labels were read for the {split} split.'
    assert np.unique(npy).size > 1, (
        f'The {split} split contains fewer than two distinct labels.')

  # If `save_model_dir` is present and the model file already exists, load the
  # model instead of training.
  # NOTE(review): the cache path is keyed only by `embedding_name` and
  # `model_name`. `l2_normalization` and `speaker_id_name` are not part of it,
  # so a model trained under one normalization setting is reused for another
  # setting that shares `save_model_dir`. The path is kept as-is for
  # compatibility with previously saved models.
  model_filename = None
  if save_model_dir:
    cur_models_dir = os.path.join(save_model_dir, embedding_name)
    tf.io.gfile.makedirs(cur_models_dir)
    model_filename = os.path.join(cur_models_dir, f'{model_name}.pickle')
  train_model = (
      model_filename is None or not tf.io.gfile.exists(model_filename))

  if train_model:
    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model: %s.', model_name)
    start = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %s, %s: %.2f min', model_name, embedding_name,
                 (time.time() - start) / 60.0)
    # If `save_model_dir` is present, write the freshly trained model there so
    # that later calls can load it.
    if model_filename is not None:
      with tf.io.gfile.GFile(model_filename, 'wb') as f:
        pickle.dump(d, f)
  else:
    # Load model.
    # SECURITY: `pickle.load` can execute arbitrary code, so `save_model_dir`
    # must only ever contain files written by a trusted party.
    with tf.io.gfile.GFile(model_filename, 'rb') as f:
      d = pickle.load(f)

  scores = {}
  for eval_metric in eval_metrics:
    eval_score, test_score = _calc_scores(eval_metric, d, npx_eval, npy_eval,
                                          npx_test, npy_test, label_list)
    logging.info('Finished eval: %s: %.3f', model_name, eval_score)
    logging.info('Finished test: %s: %.3f', model_name, test_score)
    scores[eval_metric] = (eval_score, test_score)

  if save_predictions_dir:
    cur_preds_dir = os.path.join(save_predictions_dir, embedding_name)
    tf.io.gfile.makedirs(cur_preds_dir)
    # NOTE: the files carry an `.npz` suffix but are written with `np.save`,
    # i.e. as single arrays. Names are kept unchanged for existing readers.
    for dat_name, dat_x, dat_y in named_splits:
      pred_filename = os.path.join(cur_preds_dir,
                                   f'{model_name}_{dat_name}_pred.npz')
      pred_y = d.predict(dat_x)
      with tf.io.gfile.GFile(pred_filename, 'wb') as f:
        np.save(f, pred_y)
      y_filename = os.path.join(cur_preds_dir,
                                f'{model_name}_{dat_name}_y.npz')
      with tf.io.gfile.GFile(y_filename, 'wb') as f:
        np.save(f, dat_y)

  return scores
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None,
                        save_predictions_dir=None,
                        eval_metric='accuracy'):
  """Train an sklearn model on embeddings and score it on eval and test data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model; a key of `models.get_sklearn_models()`.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, write sklearn models to
      `<save_model_dir>/<embedding_name>/<model_name>.pickle`.
    save_predictions_dir: If not `None`, write numpy array of predictions on
      train, eval, and test into this directory.
    eval_metric: String name of the desired evaluation metric.

  Returns:
    A tuple of Python floats, (eval metric, test metric).

  Raises:
    AssertionError: If any split has no embeddings or no labels, or if the
      labels of any split contain fewer than two distinct values.
  """

  def _read_glob(glob, name):
    """Reads one split into (embeddings, labels) arrays and logs the time."""
    start = time.time()
    npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                               label_name, label_list,
                                               l2_normalization,
                                               speaker_id_name)
    logging.info('Finished reading %s %s data: %.2f sec.', embedding_name,
                 name, time.time() - start)
    return npx, npy

  # Read and validate data.
  npx_train, npy_train = _read_glob(train_glob, 'train')
  npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
  npx_test, npy_test = _read_glob(test_glob, 'test')

  named_splits = (
      ('train', npx_train, npy_train),
      ('eval', npx_eval, npy_eval),
      ('test', npx_test, npy_test),
  )
  # Sanity check npx_*: every split must contain embeddings.
  for split, npx, _ in named_splits:
    assert npx.size > 0, f'No embeddings were read for the {split} split.'
  # Sanity check npy_*: every split needs labels from at least two classes.
  for split, _, npy in named_splits:
    assert npy.size > 0, f'No labels were read for the {split} split.'
    assert np.unique(npy).size > 1, (
        f'The {split} split contains fewer than two distinct labels.')

  # Train models.
  d = models.get_sklearn_models()[model_name]()
  logging.info('Made model: %s.', model_name)
  start = time.time()
  d.fit(npx_train, npy_train)
  logging.info('Trained model: %s, %s: %.2f min', model_name, embedding_name,
               (time.time() - start) / 60.0)

  eval_score, test_score = _calc_eval_scores(eval_metric, d, npx_eval,
                                             npy_eval, npx_test, npy_test)
  logging.info('Finished eval: %s: %.3f', model_name, eval_score)
  # The test score gets its own label so it can be told apart from the eval
  # score in the logs.
  logging.info('Finished test: %s: %.3f', model_name, test_score)

  # If `save_model_dir` is present, write model to this directory.
  # To load the model after saving, use:
  # ```python
  # with file_utils.Open(model_filename, 'rb') as f:
  #   m = pickle.load(f)
  # ```
  # Only unpickle files you trust: `pickle.load` can execute arbitrary code.
  if save_model_dir:
    cur_models_dir = os.path.join(save_model_dir, embedding_name)
    file_utils.MaybeMakeDirs(cur_models_dir)
    model_filename = os.path.join(cur_models_dir, f'{model_name}.pickle')
    with file_utils.Open(model_filename, 'wb') as f:
      pickle.dump(d, f)

  if save_predictions_dir:
    cur_preds_dir = os.path.join(save_predictions_dir, embedding_name)
    file_utils.MaybeMakeDirs(cur_preds_dir)
    # NOTE: the files carry an `.npz` suffix but are written with `np.save`,
    # i.e. as single arrays. Names are kept unchanged for existing readers.
    for dat_name, dat_x, dat_y in named_splits:
      pred_filename = os.path.join(cur_preds_dir,
                                   f'{model_name}_{dat_name}_pred.npz')
      pred_y = d.predict(dat_x)
      with file_utils.Open(pred_filename, 'wb') as f:
        np.save(f, pred_y)
      y_filename = os.path.join(cur_preds_dir,
                                f'{model_name}_{dat_name}_y.npz')
      with file_utils.Open(y_filename, 'wb') as f:
        np.save(f, dat_y)

  return (eval_score, test_score)
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None):
  """Train an sklearn model on embeddings and score it on eval and test data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model; a key of `models.get_sklearn_models()`.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.

  Returns:
    A tuple `(eval_score, test_score)`: the values returned by the trained
    model's `score` method on the eval and test data respectively.

  Raises:
    AssertionError: If any split has no embeddings or no labels, or if the
      labels of any split contain fewer than two distinct values.
  """

  def _read_glob(glob, name):
    """Reads one split into (embeddings, labels) arrays and logs the time."""
    start = time.time()
    npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                               label_name, label_list,
                                               l2_normalization,
                                               speaker_id_name)
    logging.info('Finished reading %s data: %.2f sec.', name,
                 time.time() - start)
    return npx, npy

  # Read and validate data.
  npx_train, npy_train = _read_glob(train_glob, 'train')
  npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
  npx_test, npy_test = _read_glob(test_glob, 'test')

  named_splits = (
      ('train', npx_train, npy_train),
      ('eval', npx_eval, npy_eval),
      ('test', npx_test, npy_test),
  )
  # Sanity check npx_*: every split must contain embeddings.
  for split, npx, _ in named_splits:
    assert npx.size > 0, 'No embeddings were read for the %s split.' % split
  # Sanity check npy_*: every split needs labels from at least two classes.
  for split, _, npy in named_splits:
    assert npy.size > 0, 'No labels were read for the %s split.' % split
    assert np.unique(npy).size > 1, (
        'The %s split contains fewer than two distinct labels.' % split)

  # Train models.
  d = models.get_sklearn_models()[model_name]()
  logging.info('Made model.')
  start = time.time()
  d.fit(npx_train, npy_train)
  logging.info('Trained model: %.2f min', (time.time() - start) / 60.0)

  # Eval.
  eval_score = d.score(npx_eval, npy_eval)
  logging.info('%s: %.3f', model_name, eval_score)

  # Test.
  test_score = d.score(npx_test, npy_test)
  logging.info('%s: %.3f', model_name, test_score)

  return (eval_score, test_score)