def testConfig(self):
    module_path = os.path.join(self.get_temp_dir(), "module")
    export_module_spec(self.spec, module_path)
    text_column = hub.text_embedding_column("text", module_path)
    config = text_column.get_config()
    cloned_text_column = hub.feature_column._TextEmbeddingColumn.from_config(
        config)
    self.assertEqual(cloned_text_column.module_spec_path,
                     text_column.module_spec_path)

    with self.assertRaisesRegexp(NotImplementedError, "Can only generate"):
        text_column = hub.text_embedding_column("text", self.spec)
        config = text_column.get_config()
def testInputLayer(self):
    features = {
        "text_a": ["hello world", "pair-programming"],
        "text_b": ["hello world", "oov token"],
    }
    feature_columns = [
        hub.text_embedding_column("text_a", self.spec, trainable=False),
        hub.text_embedding_column("text_b", self.spec, trainable=False),
    ]
    with tf.Graph().as_default():
        input_layer = tf.feature_column.input_layer(features, feature_columns)
        with tf.train.MonitoredSession() as sess:
            output = sess.run(input_layer)
            self.assertAllEqual(
                output,
                [[1, 2, 3, 4, 1, 2, 3, 4], [5, 5, 5, 5, 0, 0, 0, 0]])
def build_model(directory="train_data.csv"):
    train_df, test_df = load_directory_data_(directory)

    # Training input on the whole training set with no limit on training epochs.
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["label"], num_epochs=None, shuffle=True)
    # Prediction on the whole training set.
    predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["label"], shuffle=False)
    # Prediction on the test set.
    predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
        test_df, test_df["label"], shuffle=False)

    tf.logging.info("Loading embeddings...")
    embedded_text_feature_column = hub.text_embedding_column(
        key="sentence",
        module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")
    tf.logging.info("Finished loading embeddings.")

    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 500],
        feature_columns=[embedded_text_feature_column],
        n_classes=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.003),
        model_dir=MODE_DIR)

    estimator.train(input_fn=train_input_fn, steps=250000)

    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
    tf.logging.info(
        "Training set accuracy: {accuracy}".format(**train_eval_result))
    tf.logging.info(
        "Test set accuracy: {accuracy}".format(**test_eval_result))
def train_and_evaluate_with_module(hub_module, train_input_fn,
                                   predict_train_input_fn,
                                   predict_dev_input_fn, train_module=False):
    embedded_text_feature_column = hub.text_embedding_column(
        key="sentence", module_spec=hub_module, trainable=train_module)

    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=N_CLASSES,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

    estimator.train(input_fn=train_input_fn, steps=1000)

    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    dev_eval_result = estimator.evaluate(input_fn=predict_dev_input_fn)

    training_set_accuracy = train_eval_result["accuracy"]
    dev_set_accuracy = dev_eval_result["accuracy"]

    return {
        "training accuracy": training_set_accuracy,
        "development accuracy": dev_set_accuracy,
    }
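A minimal driver sketch for train_and_evaluate_with_module above, assuming TF1-style APIs and pandas DataFrames train_df / dev_df with "sentence" and "label" columns; the module URL is illustrative, not taken from the snippet:

import tensorflow as tf

# Assumed inputs: pandas DataFrames `train_df` and `dev_df` holding the
# "sentence" feature and a "label" column (hypothetical names).
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["label"], num_epochs=None, shuffle=True)
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["label"], shuffle=False)
predict_dev_input_fn = tf.estimator.inputs.pandas_input_fn(
    dev_df, dev_df["label"], shuffle=False)

results = train_and_evaluate_with_module(
    "https://tfhub.dev/google/nnlm-en-dim128/1",
    train_input_fn, predict_train_input_fn, predict_dev_input_fn,
    train_module=False)
print(results)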
def testDenseFeatures_shareAcrossApplication(self):
    features = {
        "text": ["hello world", "pair-programming"],
    }
    feature_columns = [
        hub.text_embedding_column("text", self.spec, trainable=True),
    ]
    if not feature_column_v2.is_feature_column_v2(feature_columns):
        self.skipTest("Resources not implemented in the state manager of "
                      "feature column v2.")
    with tf.Graph().as_default():
        # We want to test with dense_features_v2.DenseFeatures. This symbol was
        # added in
        # https://github.com/tensorflow/tensorflow/commit/64586f18724f737393071125a91b19adf013cf8a.
        feature_layer = tf.compat.v2.keras.layers.DenseFeatures(feature_columns)
        feature_layer_out_1 = feature_layer(features)
        feature_layer_out_2 = feature_layer(features)

        # We define the loss only on the first layer. Since the layers share
        # weights, we expect the second layer's output to change too.
        loss = feature_layer_out_1 - tf.constant(0.005)
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.7)
        train_op = optimizer.minimize(loss)

        with tf.compat.v1.train.MonitoredSession() as sess:
            before_update_1 = sess.run(feature_layer_out_1)
            sess.run(train_op)
            after_update_1 = sess.run(feature_layer_out_1)
            after_update_2 = sess.run(feature_layer_out_2)

            self.assertAllEqual(before_update_1, [[1, 2, 3, 4], [5, 5, 5, 5]])
            self.assertAllEqual(after_update_1, after_update_2)
def testTrainableEmbeddingColumn(self):
    feature_columns = [
        hub.text_embedding_column("text", self.spec, trainable=True),
    ]
    with tf.Graph().as_default():
        features = {
            "text": ["hello world", "pair-programming"],
        }
        target = [[1, 1, 1, 1], [4, 3, 2, 1]]
        input_layer = tf_v1.feature_column.input_layer(features,
                                                       feature_columns)
        loss = tf.cast(
            tf_v1.losses.mean_squared_error(input_layer, target), tf.float64)
        optimizer = tf_v1.train.GradientDescentOptimizer(learning_rate=0.97)
        train_op = optimizer.minimize(loss)

        with tf_v1.train.MonitoredSession() as sess:
            self.assertAllEqual(sess.run(input_layer),
                                [[1, 2, 3, 4], [5, 5, 5, 5]])
            for _ in range(10):
                sess.run(train_op)
            self.assertAllClose(sess.run(input_layer), target, atol=0.5)
def train_classifier_idealist(X_train, path=None):
    train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        X_train, X_train["Spam"], num_epochs=None, shuffle=True)
    embedded_text_feature_column = hub.text_embedding_column(
        key="DESCRIPTION",
        module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")
    estimator = tf.compat.v1.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=2,
        model_dir=path,
        optimizer=tf.compat.v1.train.AdagradOptimizer(learning_rate=0.003))
    estimator.train(input_fn=train_input_fn, steps=100)

    if path is not None:
        try:
            serving_input_fn = (
                tf.compat.v1.estimator.export
                .build_parsing_serving_input_receiver_fn(
                    tf.feature_column.make_parse_example_spec(
                        [embedded_text_feature_column])))
            # export_path = estimator.export_saved_model(path, serving_input_fn)
            export_path = estimator.latest_checkpoint()
            # Pickle the load path so the model can be restored later.
            with open(path + "/loadPath.txt", "wb") as fp:
                pickle.dump(export_path, fp)
        except Exception:
            print("Could not save USE for", path)
    return estimator
def testDenseFeatures_shareAcrossApplication(self):
    features = {
        "text": ["hello world", "pair-programming"],
    }
    feature_columns = [
        hub.text_embedding_column("text", self.spec, trainable=True),
    ]
    if not feature_column_v2.is_feature_column_v2(feature_columns):
        self.skipTest("Resources not implemented in the state manager of "
                      "feature column v2.")
    with tf.Graph().as_default():
        feature_layer = _dense_features_module.DenseFeatures(feature_columns)
        feature_layer_out_1 = feature_layer(features)
        feature_layer_out_2 = feature_layer(features)

        # We define the loss only on the first layer. Since the layers share
        # weights, we expect the second layer's output to change too.
        loss = feature_layer_out_1 - tf.constant(0.005)
        optimizer = tf_v1.train.GradientDescentOptimizer(learning_rate=0.7)
        train_op = optimizer.minimize(loss)

        with tf_v1.train.MonitoredSession() as sess:
            before_update_1 = sess.run(feature_layer_out_1)
            sess.run(train_op)
            after_update_1 = sess.run(feature_layer_out_1)
            after_update_2 = sess.run(feature_layer_out_2)

            self.assertAllEqual(before_update_1, [[1, 2, 3, 4], [5, 5, 5, 5]])
            self.assertAllEqual(after_update_1, after_update_2)
def testMakeParseExampleSpec(self):
    text_column = hub.text_embedding_column("text", self.spec,
                                            trainable=False)
    parsing_spec = tf.feature_column.make_parse_example_spec([text_column])
    self.assertEqual(parsing_spec,
                     {"text": tf.FixedLenFeature([1], dtype=tf.string)})
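A hedged sketch of where such a parsing spec typically ends up: building a parsing serving input receiver for SavedModel export (the `spec` module path is an assumption here):

import tensorflow as tf
import tensorflow_hub as hub

# `spec` is assumed to point at a valid text-embedding module.
text_column = hub.text_embedding_column("text", spec, trainable=False)
parsing_spec = tf.feature_column.make_parse_example_spec([text_column])
# The spec maps each feature key to a FixedLenFeature, which is what a
# parsing serving input receiver expects.
serving_input_fn = (
    tf.estimator.export.build_parsing_serving_input_receiver_fn(parsing_spec))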
def tensorflowPredict(self, description):
    descriptions = [description]
    embedded_text_feature_column = hub.text_embedding_column(
        key="name_and_description",
        module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")
    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        model_dir='tensorflowmodel',
        feature_columns=[embedded_text_feature_column],
        n_classes=2,
        optimizer=tf.compat.v1.train.AdagradOptimizer(learning_rate=0.05))
    predict_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
        {"name_and_description": np.array(descriptions).astype(np.str)},
        shuffle=False)
    results = estimator.predict(predict_input_fn)
    # for result in results:
    #     print(result['class_ids'][0])
    #     top_2 = result['probabilities'].argsort()[-2:][::-1]
    #     for genre in top_2:
    #         print('result' + ': ' +
    #               str(round(result['probabilities'][genre] * 100, 2)) + '%')
    #     print('')
    #     print(result)
    predictions = np.array([item['class_ids'][0] for item in results])
    return "Prediction: {}".format(str(predictions))
def testWorksWithCannedEstimator(self):
    comment_embedding_column = hub.text_embedding_column(
        "comment", self.spec, trainable=False)
    upvotes = tf.feature_column.numeric_column("upvotes")

    feature_columns = [comment_embedding_column, upvotes]
    estimator = tf.estimator.DNNClassifier(
        hidden_units=[10],
        feature_columns=feature_columns,
        model_dir=self.get_temp_dir())

    # This only tests that estimator apis are working with the feature
    # column without throwing exceptions.
    features = {
        "comment": np.array([
            ["the quick brown fox"],
            ["spam spam spam"],
        ]),
        "upvotes": np.array([
            [20],
            [1],
        ]),
    }
    labels = np.array([[1], [0]])
    input_fn = tf.estimator.inputs.numpy_input_fn(features, labels,
                                                  shuffle=True)
    estimator.train(input_fn, max_steps=1)
    estimator.evaluate(input_fn, steps=1)
    estimator.predict(input_fn)
def predict(X_test):
    # ...
    dataKey = 'Question'
    labelKey = 'y'
    full_model_dir = "/home/sbs/Desktop/Dev/ChatBot/EstimatorModels"
    # Pick the most recently modified export directory.
    full_model_dir = sorted(
        glob.glob(os.path.join(full_model_dir, '*/')),
        key=os.path.getmtime)[-1]
    embeded_text_url = "https://tfhub.dev/google/nnlm-en-dim128/1"
    embedded_text_feature_column = hub.text_embedding_column(
        key=dataKey, module_spec=embeded_text_url)
    # ...
    with tf.Session() as sess:
        tf.saved_model.loader.load(sess,
                                   [tf.saved_model.tag_constants.SERVING],
                                   full_model_dir)
        predictor = tf.contrib.predictor.from_saved_model(full_model_dir)
        # Prediction on the test set.
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            X_test, X_test[labelKey], shuffle=True)
        model_input = tf.train.Example(features=predict_test_input_fn())
        model_input = model_input.SerializeToString()
        output_dict = predictor({"y": [model_input]})
        # y_predicted = output_dict["pred_output_classes"][0]
        print(output_dict)
def train_and_evaluate_with_module(hub_module, train_module=False):
    """Train a classifier and report training and test accuracy.

    Args:
        hub_module: a TF Hub module spec; see
            https://www.tensorflow.org/resources/models-datasets
        train_module: whether to fine-tune the pretrained module.
    """
    embedded_text_feature_column = hub.text_embedding_column(
        key="text", module_spec=hub_module, trainable=train_module)

    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=2,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

    estimator.train(input_fn=train_input_fn, steps=1000)

    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

    training_set_accuracy = train_eval_result["accuracy"]
    test_set_accuracy = test_eval_result["accuracy"]

    return {
        "Training accuracy": training_set_accuracy,
        "Test accuracy": test_set_accuracy,
    }
def _model_fn(self, features, labels, mode, params, config):
    embedded_text_feature_column = hub.text_embedding_column(
        key=base_model.TEXT_FEATURE_KEY,
        module_spec=FLAGS.model_spec,
        trainable=FLAGS.trainable)
    inputs = tf.feature_column.input_layer(features,
                                           [embedded_text_feature_column])
    batch_size = tf.shape(inputs)[0]

    logits = inputs
    for num_units in params.dense_units:
        logits = tf.layers.dense(inputs=logits, units=num_units,
                                 activation=tf.nn.relu)
        logits = tf.layers.dropout(logits, rate=params.dropout_rate)
    logits = tf.layers.dense(inputs=logits,
                             units=len(self._target_labels),
                             activation=None)

    output_heads = [
        tf.contrib.estimator.binary_classification_head(
            name=name, weight_column=name + '_weight')
        for name in self._target_labels
    ]
    multihead = tf.contrib.estimator.multi_head(output_heads)

    optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
    return multihead.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        optimizer=optimizer)
def object_recognition_classifier(number_of_classes, path_test_dataset,
                                  path_model, path_module, reject,
                                  path_predicted_labels):
    test_df = download_and_load_datasets_test(path_test_dataset)

    # Prediction on the test set.
    predict_test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        test_df, test_df["polarity"], shuffle=False)

    embedded_text_feature_column = hub.text_embedding_column(
        key="coordonates", module_spec=path_module)

    loaded_ckpt = tf.train.load_checkpoint(path_model)
    estimator_loaded = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=number_of_classes,
        warm_start_from=path_model)

    tab_predictions, tab_probabilities = best_classes_and_probabilities(
        estimator_loaded, predict_test_input_fn)
    tab_predictions_with_reject = ambiguity_reject(
        tab_predictions, tab_probabilities, reject, number_of_classes)

    with open(path_predicted_labels, 'w') as trgt_predict_txt:
        trgt_predict_txt.writelines(str(tab_predictions_with_reject))
def train_and_evaluate_with_module(hub_module, train_module=False, rate=0.003,
                                   steps=1000, hunits=[500, 100], nc=2):
    start_time = time.time()

    embedded_text_feature_column = hub.text_embedding_column(
        key="content", module_spec=hub_module, trainable=train_module)

    estimator = tf.estimator.DNNClassifier(
        hidden_units=hunits,
        feature_columns=[embedded_text_feature_column],
        n_classes=nc,
        optimizer=tf.train.AdagradOptimizer(learning_rate=rate))

    estimator.train(input_fn=train_input_fn, steps=steps)

    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

    training_set_accuracy = train_eval_result["accuracy"]
    test_set_accuracy = test_eval_result["accuracy"]

    return {
        "Training accuracy": training_set_accuracy,
        "Test accuracy": test_set_accuracy,
        "Learning rate": rate,
        "Steps": steps,
        "n classes": nc,
        "Hidden units": hunits,
        "Time elapsed": round(time.time() - start_time),
    }
def train_and_evaluate_with_sentence_encoder(hub_module, train_module=False,
                                             path=''):
    embedding_feature = hub.text_embedding_column(
        key='sentence', module_spec=hub_module, trainable=train_module)

    print('Training with', hub_module)
    print('Trainable is:', train_module)

    dnn = tf.estimator.DNNClassifier(
        hidden_units=[512, 128],
        feature_columns=[embedding_feature],
        n_classes=2,
        activation_fn=tf.nn.relu,
        dropout=0.1,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.005),
        model_dir=path,
        config=my_checkpointing_config)

    for step in range(0, TOTAL_STEPS + 1, STEP_SIZE):
        print('Training for step =', step)
        dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
        print('Eval Metrics (Train):',
              dnn.evaluate(input_fn=predict_train_input_fn))
        print('Eval Metrics (Validation):',
              dnn.evaluate(input_fn=predict_val_input_fn))
        print('\n')

    predictions_train = get_predictions(estimator=dnn,
                                        input_fn=predict_train_input_fn)
    predictions_dev = get_predictions(estimator=dnn,
                                      input_fn=predict_test_input_fn)
    return predictions_train, predictions_dev, dnn
def test(self, test, version):
    disable_gpu()

    test_utterances = test['utterance'].astype('str')
    test_intents = test['intent']
    test['predict_intent'] = ''
    test['match'] = 0

    encoder = LabelEncoder()
    encoder.fit_transform(test_intents)
    test_encoded = encoder.transform(test_intents)
    num_classes = len(encoder.classes_)

    embeddings = hub.text_embedding_column(
        'utterance', module_spec=MODULE_SPEC[self.embedding], trainable=False)
    multi_class_head = tf.contrib.estimator.multi_class_head(
        num_classes, loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)
    estimator = tf.contrib.estimator.DNNEstimator(
        head=multi_class_head,
        hidden_units=[64, 10],
        model_dir='models/tf/benchmark/' + self.embedding + '/' + version,
        feature_columns=[embeddings])

    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        {"utterance": np.array(test_utterances).astype(np.str)},
        shuffle=False)
    results = estimator.predict(predict_input_fn)

    index = 0
    total = len(test)
    predict_intent_idx = test.columns.get_loc('predict_intent')
    match_idx = test.columns.get_loc('match')

    # Display predictions
    for result in results:
        idx = np.argmax(result['probabilities'])
        intent = encoder.classes_[idx]
        row = test.iloc[index]
        test.iat[index, predict_intent_idx] = intent
        if row['intent'] == intent:
            test.iat[index, match_idx] = 1
        index += 1
        printProgress(index, total)

    # Percentage of correct predictions
    missed = test[test['match'] == 0]
    accuracy = 100 * (1 - len(missed) / len(test))
    print('DNN NLU scores %0.2f%% with %d false predictions in total %d samples'
          % (accuracy, len(missed), len(test)))
    save_csv(missed, 'missed/' + version + '/' + self.embedding + '.tf.csv')

    result = test['match'].value_counts()
    return result
def classify(train_df: pd.DataFrame, test_df: pd.DataFrame):
    # Training input on the whole training set, capped at 50 epochs.
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["class"], num_epochs=50, shuffle=True)
    # Prediction on the whole training set.
    predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["class"], shuffle=False)
    # Prediction on the test set.
    predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
        test_df, test_df["class"], shuffle=False)

    print("Download pretrained model")
    embed = hub.text_embedding_column(
        key="sentence",
        module_spec="https://tfhub.dev/google/nnlm-de-dim128/1",
        trainable=True)

    print("Train estimator")
    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embed],
        n_classes=3,
        dropout=0.6,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))
    estimator.train(input_fn=train_input_fn)

    print("Evaluate estimator")
    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

    training_set_accuracy = train_eval_result["accuracy"]
    test_set_accuracy = test_eval_result["accuracy"]
    print("Training set accuracy: ")
    print(training_set_accuracy)
    print("Test set accuracy: ")
    print(test_set_accuracy)

    LABELS = ["positive", "negative", "neutral"]

    # Create a confusion matrix on the test data.
    with tf.Graph().as_default():
        cm = tf.confusion_matrix(
            test_df["class"],
            get_predictions(estimator, predict_test_input_fn))
        with tf.Session() as session:
            cm_out = session.run(cm)

    # Normalize the confusion matrix so that each row sums to 1.
    cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

    sns.heatmap(cm_out, annot=True, xticklabels=LABELS, yticklabels=LABELS)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()
def testFeatureColumnsWithResources(self, mock_add_resource):
    feature_column = hub.text_embedding_column("text_a", self.spec)
    if not isinstance(feature_column, feature_column_v2.FeatureColumn):
        self.skipTest("Resources not implemented in the state manager of "
                      "feature column v2.")
    self.assertTrue(feature_column_v2.is_feature_column_v2([feature_column]))
def main():
    tf.logging.set_verbosity(tf.logging.ERROR)

    t = time.time()
    train_df, test_df = download_and_load_datasets()
    train_df.head()
    print("Data Loaded, time: %.5fs" % (time.time() - t))

    # Training input on the whole training set, capped at 5 epochs.
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["polarity"], batch_size=128, num_epochs=5,
        shuffle=True)
    # Prediction on the whole training set.
    predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df["polarity"], batch_size=128, shuffle=False)
    # Prediction on the test set.
    predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
        test_df, test_df["polarity"], batch_size=128, shuffle=False)

    embedded_text_feature_column = hub.text_embedding_column(
        key="sentence",
        module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")
    print("Sentences embedded, time: %.5fs" % (time.time() - t))

    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=2,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

    # Training: train the estimator for a reasonable number of steps.
    print("Train")
    # Training for 1,000 steps means 128,000 training examples with the default
    # batch size. This is roughly equivalent to 5 epochs since the training
    # dataset contains 25,000 examples.
    estimator.train(input_fn=train_input_fn, steps=1000)
    print("Training complete, time: %.5fs" % (time.time() - t))

    # Prediction: run evaluation for both the training and the test set.
    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
    print("Training set accuracy: {accuracy}".format(**train_eval_result))
    print("Test set accuracy: {accuracy}".format(**test_eval_result))
def testDenseFeatures(self):
    features = {
        "text_a": ["hello world", "pair-programming"],
        "text_b": ["hello world", "oov token"],
    }
    feature_columns = [
        hub.text_embedding_column("text_a", self.spec, trainable=False),
        hub.text_embedding_column("text_b", self.spec, trainable=False),
    ]
    if not feature_column_v2.is_feature_column_v2(feature_columns):
        self.skipTest("Resources not implemented in the state manager of "
                      "feature column v2.")
    with tf.Graph().as_default():
        feature_layer = _dense_features_module.DenseFeatures(feature_columns)
        feature_layer_out = feature_layer(features)
        with tf_v1.train.MonitoredSession() as sess:
            output = sess.run(feature_layer_out)
            self.assertAllEqual(
                output,
                [[1, 2, 3, 4, 1, 2, 3, 4], [5, 5, 5, 5, 0, 0, 0, 0]])
def save(estimator, text_embedding_module, export_dir_base):
    embedded_text_feature_column = hub.text_embedding_column(
        key="sentence", module_spec=text_embedding_module)
    feature_columns = [embedded_text_feature_column]
    feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = (
        tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec))
    estimator.export_savedmodel(
        export_dir_base=export_dir_base,
        serving_input_receiver_fn=serving_input_receiver_fn)
def create(text_embedding_module, model_dir):
    embedded_text_feature_column = hub.text_embedding_column(
        key="sentence", module_spec=text_embedding_module)
    return tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=2,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003),
        model_dir=model_dir)
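A possible way to wire create() and save() together; the module URL, directories, and train_input_fn below are illustrative assumptions:

# Illustrative sketch: URL, paths, and train_input_fn are assumptions.
module = "https://tfhub.dev/google/nnlm-en-dim128/1"
estimator = create(module, model_dir="/tmp/text_model")
estimator.train(input_fn=train_input_fn, steps=1000)  # train_input_fn assumed
save(estimator, module, export_dir_base="/tmp/text_model/export")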
def fit(self, X, y=None):
    embedded_text_feature_column = hub.text_embedding_column(
        key=self.text_feature_key,
        module_spec=self.tf_hub_module,
        trainable=self.trainable)

    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        X, y, batch_size=self.batch_size, num_epochs=None, shuffle=True)

    optimizer = tf.train.ProximalAdagradOptimizer(
        learning_rate=self.learning_rate,
        l1_regularization_strength=self.l1_regularization_strength,
        l2_regularization_strength=self.l2_regularization_strength)

    activation = {
        'relu': tf.nn.relu,
        'tanh': tf.nn.tanh,
        'sigmoid': tf.nn.sigmoid,
        'elu': tf.nn.elu,
        'softplus': tf.nn.softplus,
        'softsign': tf.nn.softsign,
        'relu6': tf.nn.relu6,
    }

    # Create a fresh random model directory.
    path = ''.join(random.choice('0123456789ABCDEF') for i in range(16))
    path = '../' + path
    self.model_dir = path
    os.mkdir(self.model_dir)
    print(self.model_dir)

    self.estimator = tf.estimator.DNNClassifier(
        hidden_units=self.hidden_units,
        feature_columns=[embedded_text_feature_column],
        n_classes=self.num_classes,
        optimizer=optimizer,
        dropout=self.dropout,
        batch_norm=self.batch_norm,
        activation_fn=activation[self.activation_fn],
        model_dir=self.model_dir)

    # Create a separate random directory for eval summaries.
    path = ''.join(random.choice('0123456789ABCDEF') for i in range(16))
    self.eval_path = './' + path
    os.mkdir(self.eval_path)
    print(self.eval_path)

    early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
        self.estimator,
        metric_name='loss',
        max_steps_without_decrease=400,
        min_steps=100,
        eval_dir=self.eval_path)

    # Pass the early-stopping hook to train() so it actually takes effect.
    self.estimator.train(input_fn=train_input_fn,
                         steps=self.training_steps,
                         hooks=[early_stopping])
    tf.reset_default_graph()
    return self
def create_feature_columns(hparams):
    title_embedding_column = hub.text_embedding_column(
        "title", "https://tfhub.dev/google/universal-sentence-encoder/1")
    feature_columns = [title_embedding_column]
    print("feature columns: \n {}".format(feature_columns))
    print("")
    return feature_columns
def run(self, debug=False):
    init_op = tf.global_variables_initializer()

    # Reduce logging output.
    if debug:
        tf.logging.set_verbosity(tf.logging.ERROR)

    self.data_train.head()

    # Training input on the whole training set with no limit on training epochs.
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=self.data_train,
        y=self.data_train[self.labelKey],
        num_epochs=None,
        shuffle=True)
    # Prediction on the whole training set.
    predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
        self.data_train, self.data_train[self.labelKey], shuffle=True)

    embedded_text_feature_column = hub.text_embedding_column(
        key=self.dataKey, module_spec=self.embeded_text_url)

    my_checkpointing_config = tf.estimator.RunConfig(
        save_checkpoints_secs=10 * 60,  # Save checkpoints every 10 minutes.
        keep_checkpoint_max=10)  # Retain the 10 most recent checkpoints.

    estimator = tf.estimator.DNNClassifier(
        hidden_units=self.hidden_units_size,
        feature_columns=[embedded_text_feature_column],
        n_classes=len(np.unique(self.data_train[self.labelKey])),
        optimizer=tf.train.AdagradOptimizer(learning_rate=self.learning_rate),
        dropout=0.25)
    # Optionally: model_dir=self.export_dir_base, config=my_checkpointing_config

    classifier = estimator.train(input_fn=train_input_fn, steps=100)

    # Save the trained model:
    # print('Exporting trained model to', self.export_dir_base)
    # classifier.export_savedmodel(
    #     export_dir_base=self.export_dir_base,
    #     serving_input_receiver_fn=self.serving_input_receiver_fn,
    #     as_text=False)

    # train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    # test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
    # print("DNN Training set accuracy: {accuracy}".format(**train_eval_result))
    # print("DNN Test set accuracy: {accuracy}".format(**test_eval_result))

    return estimator
def build_estimator(config, hidden_units, learning_rate, dropout, optimizer,
                    hub_module, train_hub):
    hub_column = hub.text_embedding_column(
        key="sentence", module_spec=hub_module, trainable=train_hub)
    return tf.estimator.DNNClassifier(
        config=config,
        feature_columns=[hub_column],
        hidden_units=hidden_units,
        optimizer=get_optimizer(optimizer, learning_rate),
        dropout=dropout)
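A hypothetical invocation of build_estimator; the RunConfig and hyperparameter values are assumptions, and the optimizer string is resolved by the get_optimizer helper referenced above but not shown:

import tensorflow as tf

# All values below are illustrative assumptions.
run_config = tf.estimator.RunConfig(model_dir="/tmp/hub_dnn")
estimator = build_estimator(
    config=run_config,
    hidden_units=[500, 100],
    learning_rate=0.003,
    dropout=0.2,
    optimizer="Adagrad",  # interpreted by the (unshown) get_optimizer helper
    hub_module="https://tfhub.dev/google/nnlm-en-dim128/1",
    train_hub=False)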
def RunClassificationModel(score, module, train_module=False):
    # We only care about errors.
    tf.logging.set_verbosity(tf.logging.ERROR)

    # Set up our data frames.
    train_df, test_df, test = load_facebook_data(score)
    train_df.head()

    # Training and prediction input functions.
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df[score], num_epochs=None, shuffle=True)
    predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
        train_df, train_df[score], shuffle=False)
    predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
        test_df, test_df[score], shuffle=False)

    # Set up our feature columns from the module.
    embedded_text_feature_column = hub.text_embedding_column(
        key="status", module_spec=module, trainable=train_module)

    # Set up our neural network.
    estimator = tf.estimator.DNNClassifier(
        hidden_units=[500, 100],
        feature_columns=[embedded_text_feature_column],
        n_classes=6,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

    # Train our neural network.
    estimator.train(input_fn=train_input_fn, steps=10)

    # Get results.
    train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
    test_predictions = estimator.predict(input_fn=predict_test_input_fn)
    train_accuracy = train_eval_result["accuracy"] * 100.
    test_accuracy = test_eval_result["accuracy"] * 100.

    # Print results.
    print("Results")
    print("================")
    print("Model: " + module)
    print("Score: " + score)
    print("Train Module: " + str(train_module))
    print("================")
    print("Training set accuracy: {0:.2f}%".format(train_accuracy))
    print("Test set accuracy: {0:.2f}%".format(test_accuracy))
    print("")
def build_dnn_classifier(self, hidden_vec, lrate):
    embedded_text_feature_column = tfhub.text_embedding_column(
        key=self.text_column_label,
        module_spec=self.encoder_module,
        trainable=True)
    return tf.estimator.DNNClassifier(
        model_dir="models/v1",
        hidden_units=hidden_vec,
        feature_columns=[embedded_text_feature_column],
        n_classes=self.n_classes,
        optimizer=tf.train.AdagradOptimizer(learning_rate=lrate))
def create_feature_columns(args):
    # Create the content_id feature column.
    content_id_column = tf.feature_column.categorical_column_with_hash_bucket(
        key="content_id",
        hash_bucket_size=number_of_content_ids)
    # Embed content_id into a lower-dimensional representation.
    embedded_content_column = tf.feature_column.embedding_column(
        categorical_column=content_id_column,
        dimension=args['content_id_embedding_dimensions'])

    # Create the category feature column.
    categorical_category_column = (
        tf.feature_column.categorical_column_with_vocabulary_file(
            key="category",
            vocabulary_file=tf.gfile.Glob(
                filename="gs://{}/hybrid_recommendation/preproc/vocabs/"
                "category_vocab.txt*".format(args['bucket']))[0],
            num_oov_buckets=1))
    # Convert the categorical category column into an indicator column so that
    # it can be used in a DNN.
    indicator_category_column = tf.feature_column.indicator_column(
        categorical_column=categorical_category_column)

    # Create the title feature column using TF Hub.
    embedded_title_column = hub.text_embedding_column(
        key="title",
        module_spec="https://tfhub.dev/google/nnlm-de-dim50-with-normalization/1",
        trainable=False)

    # Create the author feature column.
    author_column = tf.feature_column.categorical_column_with_hash_bucket(
        key="author",
        hash_bucket_size=number_of_authors + 1)
    # Embed author into a lower-dimensional representation.
    embedded_author_column = tf.feature_column.embedding_column(
        categorical_column=author_column,
        dimension=args['author_embedding_dimensions'])

    # Create the months-since-epoch boundaries list for binning.
    months_since_epoch_boundaries = list(range(400, 700, 20))
    # Create the months_since_epoch feature column from the raw data.
    months_since_epoch_column = tf.feature_column.numeric_column(
        key="months_since_epoch")
    # Bucketize months_since_epoch using the boundaries.
    months_since_epoch_bucketized = tf.feature_column.bucketized_column(
        source_column=months_since_epoch_column,
        boundaries=months_since_epoch_boundaries)

    # Cross the categorical category column with the bucketized
    # months-since-epoch column.
    crossed_months_since_category_column = tf.feature_column.crossed_column(
        keys=[categorical_category_column, months_since_epoch_bucketized],
        hash_bucket_size=len(months_since_epoch_boundaries) *
        (number_of_categories + 1))
    # Convert the crossed column into an indicator column for use in a DNN.
    indicator_crossed_months_since_category_column = (
        tf.feature_column.indicator_column(
            categorical_column=crossed_months_since_category_column))

    # Create user and item factor feature columns from the trained WALS model.
    user_factors = [
        tf.feature_column.numeric_column(key="user_factor_" + str(i))
        for i in range(10)
    ]
    item_factors = [
        tf.feature_column.numeric_column(key="item_factor_" + str(i))
        for i in range(10)
    ]

    # Assemble the list of feature columns.
    feature_columns = [
        embedded_content_column,
        embedded_author_column,
        indicator_category_column,
        embedded_title_column,
        indicator_crossed_months_since_category_column,
    ] + user_factors + item_factors

    return feature_columns
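A hedged sketch of feeding these columns into a canned estimator; the args values and classifier settings are assumptions, and constructing the vocabulary column above requires access to the referenced GCS bucket:

import tensorflow as tf

# Hypothetical hyperparameters; keys mirror those read by
# create_feature_columns above.
args = {
    "bucket": "my-bucket",  # assumed GCS bucket
    "content_id_embedding_dimensions": 10,
    "author_embedding_dimensions": 10,
}
feature_columns = create_feature_columns(args)
estimator = tf.estimator.DNNClassifier(
    hidden_units=[256, 128],
    feature_columns=feature_columns,
    n_classes=2)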