Example #1
def main(argv):
    del argv  # unused

    embeddings_path = FLAGS.embeddings_path

    preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)

    # nltk.word_tokenize depends on the 'punkt' tokenizer models.
    nltk.download("punkt")
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=train_preprocess_fn)

    # TODO: Move embedding *into* Keras model.
    model_tf = tf_gru_attention_multiclass.TFRNNModel(dataset.labels())
    model = preprocessor.add_embedding_to_model(model_tf,
                                                base_model.TOKENS_FEATURE_KEY)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval()

    # Export the trained model for serving. The serving input fn maps raw
    # text to token ids using the preprocessor's vocabulary.
    serving_input_fn = serving_input.create_serving_input_fn(
        word_to_idx=preprocessor._word_to_idx,
        unknown_token=preprocessor._unknown_token,
        text_feature_name=base_model.TOKENS_FEATURE_KEY,
        example_key_name=base_model.EXAMPLE_KEY)
    trainer.export(serving_input_fn, base_model.EXAMPLE_KEY)
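All of these examples assume the surrounding absl boilerplate: flags such as embeddings_path are defined at module level and main is started via app.run. A minimal sketch, with flag names taken from the snippets (the real definitions and help strings live elsewhere in the project):

# Sketch of the flag/entry-point boilerplate the examples rely on.
from absl import app
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string('embeddings_path', None,
                    'Path to the pre-trained word embeddings file.')
flags.DEFINE_string('warm_start_from', None,
                    'Optional checkpoint to warm-start training from.')

if __name__ == '__main__':
    app.run(main)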
Example #2
    def test_TFRecordInputWithTokenizer_rounded(self):
        FLAGS.labels = 'label'
        FLAGS.round_labels = True
        dataset_input = tfrecord_input.TFRecordInputWithTokenizer(
            train_preprocess_fn=self.preprocessor)

        # TF1-style test: materialize the parsed tensors inside a session.
        with self.test_session():
            features, labels = dataset_input._read_tf_example(self.ex_tensor)
            self.assertEqual(
                list(features[base_model.TOKENS_FEATURE_KEY].eval()),
                [12, 13, 999])
            self.assertEqual(labels['label'].eval(), 1.0)
            self.assertEqual(features['label_weight'].eval(), 1.0)
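The self.ex_tensor fixture is built in the test's setUp, which is not shown here. A minimal sketch of what it plausibly looks like, assuming (hypothetically) that the raw text lives under a 'text' feature and the label under 'label'; with FLAGS.round_labels = True, a fractional label such as 0.8 would then round to the 1.0 the test asserts:

# Hypothetical fixture (feature names 'text' and 'label' are assumptions):
# serialize a tf.train.Example into the scalar string tensor that
# _read_tf_example parses.
import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'text': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b'some raw comment text'])),
    'label': tf.train.Feature(
        float_list=tf.train.FloatList(value=[0.8])),
}))
ex_tensor = tf.constant(example.SerializeToString())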
Example #3
def main(argv):
  del argv  # unused

  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInputWithTokenizer(
      train_preprocess_fn=train_preprocess_fn)

  # TODO: Move embedding *into* Keras model.
  model = preprocessor.add_embedding_to_model(
      keras_cnn.KerasCNNModel(dataset.labels()), base_model.TOKENS_FEATURE_KEY)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()
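Each example hands nltk.word_tokenize to the preprocessor as the training-time tokenizer. For reference, this is what that tokenizer produces (it needs the 'punkt' data downloaded above):

import nltk
nltk.download('punkt')

nltk.word_tokenize("Don't panic, it's just an example.")
# ['Do', "n't", 'panic', ',', 'it', "'s", 'just', 'an', 'example', '.']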
Example #4
def main(argv):
    del argv  # unused

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

    nltk.download('punkt')
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=train_preprocess_fn, max_seq_len=5000)

    model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
        dataset.labels())
    model = preprocessor.add_embedding_to_model(model_tf,
                                                base_model.TOKENS_FEATURE_KEY)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval()
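This example is the only one that caps input length with max_seq_len=5000. The project's actual handling lives inside TFRecordInputWithTokenizer, but a pad-or-truncate step of this kind typically looks like the following sketch (illustrative only, not the project's implementation):

import tensorflow as tf

def pad_or_truncate(tokens, max_seq_len):
    """Clip a 1-D tensor of token ids to max_seq_len, padding with zeros."""
    tokens = tokens[:max_seq_len]
    pad_len = max_seq_len - tf.shape(tokens)[0]
    return tf.pad(tokens, [[0, pad_len]])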
Example #5
def main(argv):
    del argv  # unused

    embeddings_path = FLAGS.embeddings_path

    preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)

    nltk.download("punkt")
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=train_preprocess_fn)

    # TODO: Move embedding *into* Keras model.
    model_tf = tf_gru_attention.TFRNNModel(dataset.labels())
    model = preprocessor.add_embedding_to_model(model_tf,
                                                base_model.TOKENS_FEATURE_KEY)

    trainer = model_trainer.ModelTrainer(dataset,
                                         model,
                                         warm_start_from=FLAGS.warm_start_from)
    trainer.train_with_eval()

    # Collect dev-set predictions for the logistic output of the 'label' head.
    key = ('label', 'logistic')
    predictions = list(trainer.evaluate_on_dev(predict_keys=[key]))

    # The validation .tfrecord has a CSV twin containing the gold labels;
    # file names follow the '<community>..tfrecord' convention.
    valid_path_csv = FLAGS.validate_path.replace("..tfrecord", ".csv")
    df = pd.read_csv(valid_path_csv)
    labels = df['label'].values

    community = os.path.basename(FLAGS.validate_path).split("..")[0]

    assert len(labels) == len(
        predictions), "Labels and predictions must have the same length."

    results = pd.DataFrame({
        "label": labels,
        "prediction": [p[key][0] for p in predictions],
        "community": [community] * len(predictions),
    })
    # Append without a header so results from multiple runs accumulate in
    # the same file.
    results.to_csv(path_or_buf=FLAGS.tmp_results_path,
                   mode='a+',
                   index=False,
                   header=False)
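Because the rows are appended without a header, reading the accumulated results back requires naming the columns explicitly, in the order written above:

import pandas as pd

results = pd.read_csv(FLAGS.tmp_results_path,
                      names=['label', 'prediction', 'community'])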