# Example 1
def main(argv):
    """Trains a Keras CNN model on TFRecord input with pre-trained embeddings."""
    del argv  # unused

    preprocessor = text_preprocessor.TextPreprocessor(
        FLAGS.embeddings_path, FLAGS.is_binary_embedding)

    # nltk.word_tokenize requires the "punkt" data package to be present.
    nltk.download("punkt")
    dataset = tfrecord_input.TFRecordInput(
        train_path=FLAGS.train_path,
        validate_path=FLAGS.validate_path,
        text_feature=FLAGS.text_feature_name,
        labels=LABELS,
        train_preprocess_fn=preprocessor.train_preprocess_fn(
            nltk.word_tokenize),
        batch_size=FLAGS.batch_size)

    # TODO: Move embedding *into* Keras model.
    model = preprocessor.add_embedding_to_model(
        keras_cnn.KerasCNNModel(set(LABELS.keys())), FLAGS.text_feature_name)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                            FLAGS.eval_steps)
# Example 2
def main(argv):
    """Trains a multiclass TF GRU+attention model and exports it for serving."""
    del argv  # unused

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

    # The NLTK word tokenizer depends on the "punkt" data package.
    nltk.download("punkt")
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=preprocessor.train_preprocess_fn(
            nltk.word_tokenize))

    # TODO: Move embedding *into* Keras model.
    model = preprocessor.add_embedding_to_model(
        tf_gru_attention_multiclass.TFRNNModel(dataset.labels()),
        base_model.TOKENS_FEATURE_KEY)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval()

    serving_input_fn = serving_input.create_serving_input_fn(
        word_to_idx=preprocessor._word_to_idx,
        unknown_token=preprocessor._unknown_token,
        text_feature_name=base_model.TOKENS_FEATURE_KEY,
        example_key_name=base_model.EXAMPLE_KEY)
    trainer.export(serving_input_fn, base_model.EXAMPLE_KEY)
# Example 3
def main(argv):
    """Trains a TF word-label-embedding model against the "frac_neg" label.

    All configuration is read from FLAGS; argv is ignored.
    """
    del argv  # unused

    embeddings_path = FLAGS.embeddings_path
    is_binary_embedding = FLAGS.is_binary_embedding
    text_feature_name = FLAGS.text_feature_name
    # NOTE: the previous revision also read FLAGS.key_name here, but the
    # value was never used; the dead local has been removed.

    preprocessor = text_preprocessor.TextPreprocessor(embeddings_path,
                                                      is_binary_embedding)

    # nltk.word_tokenize requires the "punkt" data package to be present.
    nltk.download("punkt")
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInput(
        train_path=FLAGS.train_path,
        validate_path=FLAGS.validate_path,
        text_feature=text_feature_name,
        labels=LABELS,
        train_preprocess_fn=train_preprocess_fn,
        batch_size=FLAGS.batch_size,
        max_seq_len=5000)  # presumably caps token count per example -- TODO confirm

    model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
        text_feature_name, "frac_neg")
    model = preprocessor.add_embedding_to_model(model_tf, text_feature_name)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                            FLAGS.eval_steps)
# Example 4
def main(argv):
  """Trains a Keras GRU+attention model and exports a serving signature."""
  del argv  # unused

  text_feature_name = FLAGS.text_feature_name

  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

  # "punkt" supplies the data nltk.word_tokenize needs.
  nltk.download("punkt")
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      train_preprocess_fn=preprocessor.train_preprocess_fn(nltk.word_tokenize),
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  model = preprocessor.add_embedding_to_model(
      keras_gru_attention.KerasRNNModel(set(LABELS.keys()),
                                        preprocessor._embedding_size),
      text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

  serving_input_fn = serving_input.create_serving_input_fn(
      word_to_idx=preprocessor._word_to_idx,
      unknown_token=preprocessor._unknown_token,
      text_feature_name=text_feature_name,
      key_name=FLAGS.key_name)
  trainer.export(serving_input_fn)
 def test_Lowercase(self):
   """Verifies that lowercase=True lowercases tokens before vocab lookup."""
   preprocessor = text_preprocessor.TextPreprocessor(
       'testdata/cats_and_dogs_onehot.vocab.txt')
   with self.test_session():
     tokenize = preprocessor.train_preprocess_fn(
         tokenizer=lambda text: text.split(' '), lowercase=True)
     token_ids = tokenize('Dogs GOOD Cats BAD rabbits not')
     self.assertEqual(list(token_ids.eval()), [1, 3, 2, 4, 7, 6])
# Example 6
def main(argv):
  """Trains a Keras CNN model with labels discovered from the dataset."""
  del argv  # unused

  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

  # Fetch the "punkt" data required by nltk.word_tokenize.
  nltk.download("punkt")
  dataset = tfrecord_input.TFRecordInputWithTokenizer(
      train_preprocess_fn=preprocessor.train_preprocess_fn(nltk.word_tokenize))

  # TODO: Move embedding *into* Keras model.
  cnn = keras_cnn.KerasCNNModel(dataset.labels())
  model = preprocessor.add_embedding_to_model(cnn,
                                              base_model.TOKENS_FEATURE_KEY)

  model_trainer.ModelTrainer(dataset, model).train_with_eval()
# Example 7
def main(argv):
    """Trains a TF word-label-embedding model on tokenized TFRecord input."""
    del argv  # unused

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

    # nltk.word_tokenize needs the "punkt" data package.
    nltk.download('punkt')
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=preprocessor.train_preprocess_fn(
            nltk.word_tokenize),
        max_seq_len=5000)

    model = preprocessor.add_embedding_to_model(
        tf_word_label_embedding.TFWordLabelEmbeddingModel(dataset.labels()),
        base_model.TOKENS_FEATURE_KEY)

    model_trainer.ModelTrainer(dataset, model).train_with_eval()
# Example 8
def main(argv):
    """Trains a TF GRU+attention model and appends dev predictions to a CSV.

    Trains (optionally warm-started from FLAGS.warm_start_from), predicts on
    the validation set, and appends (label, prediction, community) rows to
    FLAGS.tmp_results_path so results from multiple runs accumulate.
    """
    del argv  # unused

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)

    # nltk.word_tokenize requires the "punkt" data package.
    nltk.download("punkt")
    train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
    dataset = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=train_preprocess_fn)

    # TODO: Move embedding *into* Keras model.
    model_tf = tf_gru_attention.TFRNNModel(dataset.labels())
    model = preprocessor.add_embedding_to_model(model_tf,
                                                base_model.TOKENS_FEATURE_KEY)

    trainer = model_trainer.ModelTrainer(dataset,
                                         model,
                                         warm_start_from=FLAGS.warm_start_from)
    trainer.train_with_eval()

    key = ('label', 'logistic')
    predictions = list(trainer.evaluate_on_dev(predict_keys=[key]))

    # Validation files appear to be named "<community>..tfrecord"; the
    # ground-truth labels live in a sibling "<community>.csv" file.
    valid_path_csv = FLAGS.validate_path.replace("..tfrecord", ".csv")
    labels = pd.read_csv(valid_path_csv)['label'].values

    community = os.path.basename(FLAGS.validate_path).split("..")[0]

    # Raise instead of assert: asserts are stripped under `python -O`.
    if len(labels) != len(predictions):
        raise ValueError(
            "Labels and predictions must have the same length.")

    results = pd.DataFrame(data={
        "label": labels,
        "prediction": [p[key][0] for p in predictions],
        "community": [community] * len(predictions),
    })
    # Append without a header so multiple communities share one results file.
    results.to_csv(path_or_buf=FLAGS.tmp_results_path,
                   mode='a+',
                   index=False,
                   header=False)
# Example 9
def main(argv):
    """Trains a TF GRU+attention model using a tokenize-tensor-op preprocessor."""
    del argv  # unused

    feature_name = FLAGS.text_feature_name

    preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)
    # Download the "punkt" data nltk.word_tokenize relies on.
    nltk.download("punkt")
    tokenize_op = preprocessor.tokenize_tensor_op(nltk.word_tokenize)

    dataset = tfrecord_input.TFRecordInput(train_path=FLAGS.train_path,
                                           validate_path=FLAGS.validate_path,
                                           text_feature=feature_name,
                                           labels=LABELS,
                                           feature_preprocessor=tokenize_op,
                                           batch_size=FLAGS.batch_size)

    rnn = tf_gru_attention.TFRNNModel(feature_name, set(LABELS.keys()))
    model = preprocessor.add_embedding_to_model(rnn, feature_name)

    trainer = model_trainer.ModelTrainer(dataset, model)
    trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                            FLAGS.eval_steps)