Esempio n. 1
0
def main(argv):
    """Train a Keras CNN text classifier with pretrained embeddings and
    periodic evaluation, driven entirely by command-line FLAGS."""
    del argv  # unused

    # Preprocessor wraps the configured (possibly binary) embedding file.
    preprocessor = text_preprocessor.TextPreprocessor(
        FLAGS.embeddings_path, FLAGS.is_binary_embedding)

    # "punkt" provides the sentence/word models nltk.word_tokenize needs.
    nltk.download("punkt")
    dataset = tfrecord_input.TFRecordInput(
        train_path=FLAGS.train_path,
        validate_path=FLAGS.validate_path,
        text_feature=FLAGS.text_feature_name,
        labels=LABELS,
        train_preprocess_fn=preprocessor.train_preprocess_fn(
            nltk.word_tokenize),
        batch_size=FLAGS.batch_size)

    # TODO: Move embedding *into* Keras model.
    cnn = keras_cnn.KerasCNNModel(set(LABELS.keys()))
    model = preprocessor.add_embedding_to_model(cnn, FLAGS.text_feature_name)

    model_trainer.ModelTrainer(dataset, model).train_with_eval(
        FLAGS.train_steps, FLAGS.eval_period, FLAGS.eval_steps)
Esempio n. 2
0
def main(argv):
    """Train a TF word/label embedding model ("frac_neg" target) with
    periodic evaluation, configured via command-line FLAGS.

    Fix: dropped the unused local `key_name = FLAGS.key_name` — it was
    assigned and never referenced in this function.
    """
    del argv  # unused

    embeddings_path = FLAGS.embeddings_path
    is_binary_embedding = FLAGS.is_binary_embedding
    text_feature_name = FLAGS.text_feature_name

    preprocessor = text_preprocessor.TextPreprocessor(embeddings_path,
                                                      is_binary_embedding)

    # "punkt" provides the models nltk.word_tokenize needs.
    nltk.download("punkt")
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInput(
        train_path=FLAGS.train_path,
        validate_path=FLAGS.validate_path,
        text_feature=text_feature_name,
        labels=LABELS,
        train_preprocess_fn=train_preprocess_fn,
        batch_size=FLAGS.batch_size,
        max_seq_len=5000)  # cap sequence length for this model

    model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
        text_feature_name, "frac_neg")
    model = preprocessor.add_embedding_to_model(model_tf, text_feature_name)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                            FLAGS.eval_steps)
Esempio n. 3
0
def main(argv):
  """Train a Keras GRU-with-attention model, then export a serving graph
  keyed by FLAGS.key_name. All configuration comes from FLAGS."""
  del argv  # unused

  text_feature = FLAGS.text_feature_name

  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

  # "punkt" provides the models nltk.word_tokenize needs.
  nltk.download("punkt")
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature,
      labels=LABELS,
      train_preprocess_fn=preprocessor.train_preprocess_fn(nltk.word_tokenize),
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  model = preprocessor.add_embedding_to_model(
      keras_gru_attention.KerasRNNModel(
          set(LABELS.keys()), preprocessor._embedding_size),
      text_feature)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period, FLAGS.eval_steps)

  # NOTE(review): reaches into TextPreprocessor private attributes
  # (_word_to_idx, _unknown_token, _embedding_size) — a public accessor
  # would be cleaner; preserved as-is.
  trainer.export(serving_input.create_serving_input_fn(
      word_to_idx=preprocessor._word_to_idx,
      unknown_token=preprocessor._unknown_token,
      text_feature_name=text_feature,
      key_name=FLAGS.key_name))
def main(argv):
    """Train a TF-Hub classifier, predict on the dev set, and append
    (label, prediction, community) rows to FLAGS.tmp_results_path."""
    del argv  # unused

    dataset = tfrecord_input.TFRecordInput()
    model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())

    trainer = model_trainer.ModelTrainer(
        dataset, model, warm_start_from=FLAGS.warm_start_from)
    trainer.train_with_eval()

    prediction_key = ("label", "probabilities")
    predictions = list(trainer.predict_on_dev(predict_keys=[prediction_key]))

    # Validation files follow a "<community>..tfrecord" naming scheme; the
    # matching gold-label CSV sits at "<community>.csv".
    csv_path = FLAGS.validate_path.replace("..tfrecord", ".csv")
    labels = pd.read_csv(csv_path)["label"].values
    community = os.path.basename(FLAGS.validate_path).split("..")[0]

    assert len(labels) == len(predictions), \
        "Labels and predictions must have the same length."

    results = pd.DataFrame(data={
        "label": labels,
        # Index [1] of the probabilities vector — presumably P(positive);
        # TODO(review): confirm class ordering.
        "prediction": [p[prediction_key][1] for p in predictions],
        "community": [community] * len(predictions),
    })
    # Append (no header) so repeated runs accumulate rows in one file.
    results.to_csv(path_or_buf=FLAGS.tmp_results_path,
                   mode='a+',
                   index=False,
                   header=False)
Esempio n. 5
0
    def test_TFRecordInput_rounded(self):
        """With round_labels set, the fractional label snaps to 1.0 and the
        present label gets full weight."""
        FLAGS.labels = 'label'
        FLAGS.round_labels = True
        input_ds = tfrecord_input.TFRecordInput()

        with self.test_session():
            feats, lbls = input_ds._read_tf_example(self.ex_tensor)
            self.assertEqual(feats[base_model.TEXT_FEATURE_KEY].eval(),
                             b'Hi there Bob')
            np.testing.assert_almost_equal(lbls['label'].eval(), 1.0)
            np.testing.assert_almost_equal(feats['label_weight'].eval(), 1.0)
Esempio n. 6
0
    def test_TFRecordInput_rounded(self):
        """round_labels=True should round the parsed label up to 1.0 while the
        tokenized comment passes through the preprocessor unchanged."""
        input_ds = tfrecord_input.TFRecordInput(
            train_path=None,
            validate_path=None,
            text_feature="comment",
            labels={"label": tf.float32},
            feature_preprocessor=self.preprocessor,
            round_labels=True)

        with self.test_session():
            feats, lbls = input_ds._read_tf_example(self.ex_tensor)
            self.assertEqual(list(feats["comment"].eval()), [12, 13, 999])
            self.assertEqual(lbls["label"].eval(), 1.0)
Esempio n. 7
0
def main(argv):
    """Train a TF-Hub classifier and export it behind a text serving
    input function keyed by the example key."""
    del argv  # unused

    dataset = tfrecord_input.TFRecordInput()
    model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval()

    input_fn = serving_input.create_text_serving_input_fn(
        text_feature_name=base_model.TEXT_FEATURE_KEY,
        example_key_name=base_model.EXAMPLE_KEY)
    trainer.export(input_fn, base_model.EXAMPLE_KEY)
Esempio n. 8
0
  def test_TFRecordInput_default_values(self):
    """Unrounded parsing keeps the 0.8 label; an absent label defaults
    to -1.0 in this configuration."""
    input_ds = tfrecord_input.TFRecordInput(
        train_path=None,
        validate_path=None,
        text_feature="comment",
        labels={"label": tf.float32, "fake_label": tf.float32},
        feature_preprocessor_init=None,
        round_labels=False)

    with self.test_session():
      feats, lbls = input_ds._read_tf_example(self.ex_tensor,
                                              self.preprocessor)
      self.assertEqual(list(feats["comment"].eval()), [12, 13, 999])
      self.assertAlmostEqual(lbls["label"].eval(), 0.8)
      self.assertAlmostEqual(lbls["fake_label"].eval(), -1.0)
Esempio n. 9
0
    def test_TFRecordInput_default_values(self):
        """Without rounding: present labels keep their raw value and weight
        1.0; the absent fake_label defaults to 0.0 with weight 0.0."""
        FLAGS.labels = 'label,fake_label,int_label'
        FLAGS.label_dtypes = 'float,float,int'
        FLAGS.round_labels = False
        input_ds = tfrecord_input.TFRecordInput()

        with self.test_session():
            feats, lbls = input_ds._read_tf_example(self.ex_tensor)
            self.assertEqual(feats[base_model.TEXT_FEATURE_KEY].eval(),
                             b'Hi there Bob')
            np.testing.assert_almost_equal(lbls['label'].eval(), 0.8)
            np.testing.assert_almost_equal(lbls['int_label'].eval(), 0.0)
            np.testing.assert_almost_equal(feats['label_weight'].eval(), 1.0)
            np.testing.assert_almost_equal(lbls['fake_label'].eval(), 0.0)
            np.testing.assert_almost_equal(feats['fake_label_weight'].eval(),
                                           0.0)
Esempio n. 10
0
def main(argv):
    """Train a TF GRU-with-attention model over embedding-mapped tokens,
    with periodic evaluation. All configuration comes from FLAGS."""
    del argv  # unused

    text_feature = FLAGS.text_feature_name

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)
    # "punkt" provides the models nltk.word_tokenize needs.
    nltk.download("punkt")

    dataset = tfrecord_input.TFRecordInput(
        train_path=FLAGS.train_path,
        validate_path=FLAGS.validate_path,
        text_feature=text_feature,
        labels=LABELS,
        feature_preprocessor=preprocessor.tokenize_tensor_op(
            nltk.word_tokenize),
        batch_size=FLAGS.batch_size)

    rnn = tf_gru_attention.TFRNNModel(text_feature, set(LABELS.keys()))
    model = preprocessor.add_embedding_to_model(rnn, text_feature)

    model_trainer.ModelTrainer(dataset, model).train_with_eval(
        FLAGS.train_steps, FLAGS.eval_period, FLAGS.eval_steps)