def main(argv):
  """Trains a Keras CNN model on TFRecord data with pretrained embeddings."""
  del argv  # unused

  preprocessor = text_preprocessor.TextPreprocessor(
      FLAGS.embeddings_path, FLAGS.is_binary_embedding)

  # The tokenizer is applied as a training-time preprocessing step.
  nltk.download("punkt")
  tokenizer_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)

  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=FLAGS.text_feature_name,
      labels=LABELS,
      train_preprocess_fn=tokenizer_fn,
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  model = preprocessor.add_embedding_to_model(
      keras_cnn.KerasCNNModel(set(LABELS.keys())), FLAGS.text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)
def main(argv):
  """Trains a TF word-label-embedding model for the "frac_neg" label.

  Reads TFRecord training/validation data, tokenizes text with NLTK's punkt
  tokenizer, attaches pretrained embeddings to the model, and runs
  train-with-eval for the configured number of steps.
  """
  del argv  # unused

  # NOTE: the original code also read FLAGS.key_name into an unused local;
  # that dead assignment has been removed.
  preprocessor = text_preprocessor.TextPreprocessor(
      FLAGS.embeddings_path, FLAGS.is_binary_embedding)

  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)

  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=FLAGS.text_feature_name,
      labels=LABELS,
      train_preprocess_fn=train_preprocess_fn,
      batch_size=FLAGS.batch_size,
      max_seq_len=5000)

  model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
      FLAGS.text_feature_name, "frac_neg")
  model = preprocessor.add_embedding_to_model(
      model_tf, FLAGS.text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)
def main(argv):
  """Trains a Keras GRU-attention model and exports it for serving."""
  del argv  # unused

  embeddings_path = FLAGS.embeddings_path
  text_feature_name = FLAGS.text_feature_name
  key_name = FLAGS.key_name

  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)

  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      train_preprocess_fn=train_preprocess_fn,
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  # NOTE(review): this reaches into TextPreprocessor private attributes
  # (_embedding_size, _word_to_idx, _unknown_token) — consider exposing a
  # public accessor on the preprocessor instead.
  rnn_model = keras_gru_attention.KerasRNNModel(
      set(LABELS.keys()), preprocessor._embedding_size)
  model = preprocessor.add_embedding_to_model(rnn_model, text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

  serving_input_fn = serving_input.create_serving_input_fn(
      word_to_idx=preprocessor._word_to_idx,
      unknown_token=preprocessor._unknown_token,
      text_feature_name=text_feature_name,
      key_name=key_name)
  trainer.export(serving_input_fn)
def main(argv):
  """Trains a TF-Hub classifier and appends dev-set predictions to a CSV.

  After training, runs prediction on the validation split, pairs each
  prediction with the gold label from the matching CSV file, and appends
  (label, prediction, community) rows to FLAGS.tmp_results_path.

  Raises:
    ValueError: if the number of gold labels does not match the number of
      predictions returned for the validation set.
  """
  del argv  # unused

  dataset = tfrecord_input.TFRecordInput()
  model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())
  trainer = model_trainer.ModelTrainer(
      dataset, model, warm_start_from=FLAGS.warm_start_from)
  trainer.train_with_eval()

  keys = [("label", "probabilities")]
  predictions = list(trainer.predict_on_dev(predict_keys=keys))

  # The dev CSV sits alongside the TFRecord; the file name encodes the
  # community before a ".." separator (e.g. "community..tfrecord").
  valid_path_csv = FLAGS.validate_path.replace("..tfrecord", ".csv")
  labels = pd.read_csv(valid_path_csv)["label"].values
  community = os.path.basename(FLAGS.validate_path).split("..")[0]

  # `assert` is stripped when Python runs with -O, so validate explicitly.
  if len(labels) != len(predictions):
    raise ValueError("Labels and predictions must have the same length.")

  results = pd.DataFrame(data={
      "label": labels,
      # Index [1] selects the positive-class probability.
      "prediction": [p[keys[0]][1] for p in predictions],
      "community": [community] * len(predictions),
  })
  results.to_csv(path_or_buf=FLAGS.tmp_results_path, mode='a+',
                 index=False, header=False)
def test_TFRecordInput_rounded(self):
  """With round_labels set, a fractional label should round to 1.0."""
  FLAGS.labels = 'label'
  FLAGS.round_labels = True

  reader = tfrecord_input.TFRecordInput()
  with self.test_session():
    features, labels = reader._read_tf_example(self.ex_tensor)

    # Raw text feature passes through unchanged.
    self.assertEqual(
        features[base_model.TEXT_FEATURE_KEY].eval(), b'Hi there Bob')
    # Label is rounded; its weight stays 1.0.
    np.testing.assert_almost_equal(labels['label'].eval(), 1.0)
    np.testing.assert_almost_equal(features['label_weight'].eval(), 1.0)
def test_TFRecordInput_rounded(self):
  """With round_labels=True, the parsed label should round up to 1.0."""
  reader = tfrecord_input.TFRecordInput(
      train_path=None,
      validate_path=None,
      text_feature="comment",
      labels={"label": tf.float32},
      feature_preprocessor=self.preprocessor,
      round_labels=True)

  with self.test_session():
    features, labels = reader._read_tf_example(self.ex_tensor)
    # Preprocessor maps tokens to vocabulary ids (999 = OOV in the fixture).
    self.assertEqual(list(features["comment"].eval()), [12, 13, 999])
    self.assertEqual(labels["label"].eval(), 1.0)
def main(argv):
  """Trains a TF-Hub classifier and exports a text-serving SavedModel."""
  del argv  # unused

  dataset = tfrecord_input.TFRecordInput()
  model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()

  # Export with a serving signature keyed on the shared example key.
  serving_input_fn = serving_input.create_text_serving_input_fn(
      text_feature_name=base_model.TEXT_FEATURE_KEY,
      example_key_name=base_model.EXAMPLE_KEY)
  trainer.export(serving_input_fn, base_model.EXAMPLE_KEY)
def test_TFRecordInput_default_values(self):
  """A label missing from the example should default to -1.0."""
  reader = tfrecord_input.TFRecordInput(
      train_path=None,
      validate_path=None,
      text_feature="comment",
      labels={"label": tf.float32, "fake_label": tf.float32},
      feature_preprocessor_init=None,
      round_labels=False)

  with self.test_session():
    features, labels = reader._read_tf_example(
        self.ex_tensor, self.preprocessor)

    self.assertEqual(list(features["comment"].eval()), [12, 13, 999])
    # Present label keeps its raw (unrounded) value.
    self.assertAlmostEqual(labels["label"].eval(), 0.8)
    # Absent label falls back to the -1.0 default.
    self.assertAlmostEqual(labels["fake_label"].eval(), -1.0)
def test_TFRecordInput_default_values(self):
  """Missing labels should get value 0.0 and weight 0.0 (FLAGS-driven)."""
  FLAGS.labels = 'label,fake_label,int_label'
  FLAGS.label_dtypes = 'float,float,int'
  FLAGS.round_labels = False

  reader = tfrecord_input.TFRecordInput()
  with self.test_session():
    features, labels = reader._read_tf_example(self.ex_tensor)

    self.assertEqual(
        features[base_model.TEXT_FEATURE_KEY].eval(), b'Hi there Bob')

    # Present float label keeps its raw value and full weight.
    np.testing.assert_almost_equal(labels['label'].eval(), 0.8)
    np.testing.assert_almost_equal(features['label_weight'].eval(), 1.0)

    # Int label present but zero-valued in the fixture.
    np.testing.assert_almost_equal(labels['int_label'].eval(), 0.0)

    # Absent label: zero value, zero weight.
    np.testing.assert_almost_equal(labels['fake_label'].eval(), 0.0)
    np.testing.assert_almost_equal(
        features['fake_label_weight'].eval(), 0.0)
def main(argv):
  """Trains a TF GRU-attention model on tokenized TFRecord text data."""
  del argv  # unused

  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

  # Tokenization runs as a tensor op inside the input pipeline.
  nltk.download("punkt")
  tokenize_op = preprocessor.tokenize_tensor_op(nltk.word_tokenize)

  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=FLAGS.text_feature_name,
      labels=LABELS,
      feature_preprocessor=tokenize_op,
      batch_size=FLAGS.batch_size)

  rnn_model = tf_gru_attention.TFRNNModel(
      FLAGS.text_feature_name, set(LABELS.keys()))
  model = preprocessor.add_embedding_to_model(
      rnn_model, FLAGS.text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)