def main(argv):
  del argv  # unused
  embeddings_path = FLAGS.embeddings_path
  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInputWithTokenizer(
      train_preprocess_fn=train_preprocess_fn)

  # TODO: Move embedding *into* Keras model.
  model_tf = tf_gru_attention_multiclass.TFRNNModel(dataset.labels())
  model = preprocessor.add_embedding_to_model(model_tf,
                                              base_model.TOKENS_FEATURE_KEY)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()

  serving_input_fn = serving_input.create_serving_input_fn(
      word_to_idx=preprocessor._word_to_idx,
      unknown_token=preprocessor._unknown_token,
      text_feature_name=base_model.TOKENS_FEATURE_KEY,
      example_key_name=base_model.EXAMPLE_KEY)
  trainer.export(serving_input_fn, base_model.EXAMPLE_KEY)
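# NOTE (hedged sketch): every main() in this listing reads its hyperparameters
# from absl FLAGS defined elsewhere in each script. The preamble below is a
# minimal illustration of what those definitions look like; the flag names
# mirror the FLAGS.* references in these mains, but the defaults and help
# strings are placeholders, not the project's actual values.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("embeddings_path", None,
                    "Path to the pre-trained word embeddings.")
flags.DEFINE_string("train_path", None, "Path to the training TFRecords.")
flags.DEFINE_string("validate_path", None,
                    "Path to the validation TFRecords.")
flags.DEFINE_integer("batch_size", 64, "Batch size used for training.")
flags.DEFINE_integer("train_steps", 10000, "Total number of training steps.")
flags.DEFINE_integer("eval_period", 500, "Steps between evaluations.")
flags.DEFINE_integer("eval_steps", 100, "Steps per evaluation pass.")

# Each script then hands control to its main() via the standard absl entry
# point at module bottom:
if __name__ == "__main__":
  app.run(main)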

def main(argv):
  del argv  # unused
  embeddings_path = FLAGS.embeddings_path
  is_binary_embedding = FLAGS.is_binary_embedding
  text_feature_name = FLAGS.text_feature_name
  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path,
                                                    is_binary_embedding)
  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      train_preprocess_fn=train_preprocess_fn,
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  model = preprocessor.add_embedding_to_model(
      keras_cnn.KerasCNNModel(set(LABELS.keys())), text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

def main(argv):
  del argv  # unused
  dataset = tfrecord_input.TFRecordInput()
  model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())
  trainer = model_trainer.ModelTrainer(
      dataset, model, warm_start_from=FLAGS.warm_start_from)
  trainer.train_with_eval()

  # Collect per-example class probabilities for the "label" head on the dev set.
  keys = [("label", "probabilities")]
  predictions = list(trainer.predict_on_dev(predict_keys=keys))

  # The gold labels live in a CSV next to the validation TFRecord; the
  # community name is encoded in the file name before the ".." separator.
  valid_path_csv = FLAGS.validate_path.replace("..tfrecord", ".csv")
  df = pd.read_csv(valid_path_csv)
  labels = df["label"].values
  community = os.path.basename(FLAGS.validate_path).split("..")[0]
  assert len(labels) == len(predictions), \
      "Labels and predictions must have the same length."

  # Append one row per dev example to the shared results file.
  d = {
      "label": labels,
      "prediction": [p[keys[0]][1] for p in predictions],
      "community": [community for p in predictions],
  }
  df = pd.DataFrame(data=d)
  df.to_csv(
      path_or_buf=FLAGS.tmp_results_path, mode='a+', index=False, header=False)

def main(argv):
  del argv  # unused
  embeddings_path = FLAGS.embeddings_path
  text_feature_name = FLAGS.text_feature_name
  key_name = FLAGS.key_name
  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      train_preprocess_fn=train_preprocess_fn,
      batch_size=FLAGS.batch_size)

  # TODO: Move embedding *into* Keras model.
  model_keras = keras_gru_attention.KerasRNNModel(
      set(LABELS.keys()), preprocessor._embedding_size)
  model = preprocessor.add_embedding_to_model(model_keras, text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

  serving_input_fn = serving_input.create_serving_input_fn(
      word_to_idx=preprocessor._word_to_idx,
      unknown_token=preprocessor._unknown_token,
      text_feature_name=text_feature_name,
      key_name=key_name)
  trainer.export(serving_input_fn)

def main(argv):
  del argv  # unused
  embeddings_path = FLAGS.embeddings_path
  is_binary_embedding = FLAGS.is_binary_embedding
  text_feature_name = FLAGS.text_feature_name
  key_name = FLAGS.key_name
  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path,
                                                    is_binary_embedding)
  nltk.download("punkt")
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      train_preprocess_fn=train_preprocess_fn,
      batch_size=FLAGS.batch_size,
      max_seq_len=5000)

  model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
      text_feature_name, "frac_neg")
  model = preprocessor.add_embedding_to_model(model_tf, text_feature_name)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

def main(argv):
  del argv  # unused
  dataset = tfrecord_input.TFRecordInput()
  model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())
  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()

  serving_input_fn = serving_input.create_text_serving_input_fn(
      text_feature_name=base_model.TEXT_FEATURE_KEY,
      example_key_name=base_model.EXAMPLE_KEY)
  trainer.export(serving_input_fn, base_model.EXAMPLE_KEY)
def main(argv): del argv # unused preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path) nltk.download("punkt") train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize) dataset = tfrecord_input.TFRecordInputWithTokenizer( train_preprocess_fn=train_preprocess_fn) # TODO: Move embedding *into* Keras model. model = preprocessor.add_embedding_to_model( keras_cnn.KerasCNNModel(dataset.labels()), base_model.TOKENS_FEATURE_KEY) trainer = model_trainer.ModelTrainer(dataset, model) trainer.train_with_eval()

def main(argv):
  del argv  # unused
  preprocessor = text_preprocessor.TextPreprocessor(FLAGS.embeddings_path)
  nltk.download('punkt')
  train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInputWithTokenizer(
      train_preprocess_fn=train_preprocess_fn, max_seq_len=5000)

  model_tf = tf_word_label_embedding.TFWordLabelEmbeddingModel(
      dataset.labels())
  model = preprocessor.add_embedding_to_model(model_tf,
                                              base_model.TOKENS_FEATURE_KEY)

  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()
def main(argv): del argv # unused embeddings_path = FLAGS.embeddings_path preprocessor = text_preprocessor.TextPreprocessor(embeddings_path) nltk.download("punkt") train_preprocess_fn = preprocessor.train_preprocess_fn(nltk.word_tokenize) dataset = tfrecord_input.TFRecordInputWithTokenizer( train_preprocess_fn=train_preprocess_fn) # TODO: Move embedding *into* Keras model. model_tf = tf_gru_attention.TFRNNModel(dataset.labels()) model = preprocessor.add_embedding_to_model(model_tf, base_model.TOKENS_FEATURE_KEY) trainer = model_trainer.ModelTrainer(dataset, model, warm_start_from=FLAGS.warm_start_from) trainer.train_with_eval() key = ('label', 'logistic') predictions = list(trainer.evaluate_on_dev(predict_keys=[key])) valid_path_csv = FLAGS.validate_path.replace("..tfrecord", ".csv") df = pd.read_csv(valid_path_csv) labels = df['label'].values community = os.path.basename(FLAGS.validate_path).split("..")[0] assert len(labels) == len( predictions), "Labels and predictions must have the same length." d = { "label": labels, "prediction": [p[key][0] for p in predictions], "community": [community for p in predictions], } df = pd.DataFrame(data=d) df.to_csv(path_or_buf=FLAGS.tmp_results_path, mode='a+', index=False, header=False)
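
# NOTE (hedged sketch): because the rows above are appended with header=False,
# any downstream analysis has to supply the column names itself. A minimal
# illustration of reading the accumulated results back, assuming only the
# three-column layout (label, prediction, community) written above:
import pandas as pd

results = pd.read_csv(
    FLAGS.tmp_results_path, names=["label", "prediction", "community"])
# e.g. summarize prediction scores per community:
print(results.groupby("community")["prediction"].describe())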

def main(argv):
  del argv  # unused
  dataset = tfrecord_simple.TFSimpleRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=FLAGS.text_feature_name,
      labels=LABELS,
      batch_size=FLAGS.batch_size)
  model = tf_hub_classifier.TFHubClassifierModel(FLAGS.text_feature_name,
                                                 set(LABELS.keys()))
  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)

  serving_input_fn = create_serving_input_fn(
      text_feature_name=FLAGS.text_feature_name, key_name=FLAGS.key_name)
  trainer.export(serving_input_fn)

def main(argv):
  del argv  # unused
  # Fetch the SentencePiece model path from the TF Hub module so the input
  # pipeline tokenizes with the same vocabulary the module was trained on.
  module = hub.Module(FLAGS.model_spec)
  with tf.Session() as sess:
    spm_path = sess.run(module(signature='spm_path'))

  dataset = TFRecordWithSentencePiece(spm_path)
  model = tf_hub_classifier.TFHubClassifierModel(dataset.labels())
  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval()

  # The exported model accepts the raw components of a SparseTensor of
  # SentencePiece ids: flat values, (row, position) indices, and dense_shape.
  values = tf.placeholder(tf.int64, shape=[None], name='values')
  indices = tf.placeholder(tf.int64, shape=[None, 2], name='indices')
  dense_shape = tf.placeholder(tf.int64, shape=[None], name='dense_shape')
  serving_input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
      'values': values,
      'indices': indices,
      'dense_shape': dense_shape
  })
  trainer.export(serving_input_fn, None)
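
# NOTE (hedged sketch): the raw serving input receiver above expects a
# SparseTensor of SentencePiece ids already split into its three components.
# The snippet below illustrates how a client might assemble such a feed for a
# batch of two examples; the token ids are made up for illustration.
import numpy as np

batch_ids = [[9, 472, 1828], [14, 7]]  # hypothetical SentencePiece ids
indices = np.array(
    [[row, col] for row, ids in enumerate(batch_ids)
     for col in range(len(ids))],
    dtype=np.int64)  # one (row, position) pair per id
values = np.array([t for ids in batch_ids for t in ids], dtype=np.int64)
dense_shape = np.array(
    [len(batch_ids), max(len(ids) for ids in batch_ids)], dtype=np.int64)
feed = {'values': values, 'indices': indices, 'dense_shape': dense_shape}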

def main(argv):
  del argv  # unused
  embeddings_path = FLAGS.embeddings_path
  text_feature_name = FLAGS.text_feature_name
  preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
  nltk.download("punkt")
  tokenize_op = preprocessor.tokenize_tensor_op(nltk.word_tokenize)
  dataset = tfrecord_input.TFRecordInput(
      train_path=FLAGS.train_path,
      validate_path=FLAGS.validate_path,
      text_feature=text_feature_name,
      labels=LABELS,
      feature_preprocessor=tokenize_op,
      batch_size=FLAGS.batch_size)
  model = preprocessor.add_embedding_to_model(
      tf_gru_attention.TFRNNModel(text_feature_name, set(LABELS.keys())),
      text_feature_name)
  trainer = model_trainer.ModelTrainer(dataset, model)
  trainer.train_with_eval(FLAGS.train_steps, FLAGS.eval_period,
                          FLAGS.eval_steps)