def test_lstm(self):
    """Train ``NeuralNetworkModel`` on the fake dataset with GloVe text encoding.

    The fake dataset (with a text column) is encoded in ``'glove'`` mode,
    the model is trained with outputs written under
    ``tmp/outputs_test/lstm``, and the reported ``val_metric`` is expected
    to be ``0.0`` within an absolute tolerance of ``1e-4``.

    Requires the pre-trained GloVe vectors at ``glove_file_path``.
    """
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=True)
    # need be changed to where you store the pre-trained GloVe file.
    glove_file_path = 'resource/glove/glove.6B.50d.txt'

    text_config = Mapping()
    text_config.mode = 'glove'
    text_config.max_words = 20
    text_config.maxlen = 5
    text_config.embedding_dim = 50
    text_config.embeddings_index = open_glove(glove_file_path)  # need to change

    encoder = Encoder(metadata, text_config=text_config)
    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    # The dev/test splits are still pushed through the fitted encoder, but
    # their outputs are not consumed by this test.
    encoder.transform(df_dev)
    encoder.transform(df_test)

    # The model reads the embedding matrix from the text config it is given,
    # so copy over the one held by the encoder's own text config.
    text_config.embedding_matrix = encoder.text_config.embedding_matrix

    model_config = get_fake_modelconfig('tmp/outputs_test')
    model_config.output_dir = os.path.join(model_config.output_dir, 'lstm')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(model_config.output_dir, exist_ok=True)

    model = NeuralNetworkModel(text_config, model_config)
    # NOTE(review): the training split is passed a second time in the
    # validation slots (the dev split was disabled here) -- presumably to
    # keep the expected metric stable on the tiny fake dataset; confirm.
    output = model.train(y_train, X_train_struc, X_train_text,
                         y_train, X_train_struc, X_train_text)

    val_metric_true = 0.0
    self.assertTrue(
        np.isclose(val_metric_true, output['val_metric'], atol=1e-4))
def test_textdata_only_tfidf(self):
    """Train ``NeuralNetworkModel`` on a text-only fake dataset using TF-IDF.

    The fake dataset is requested with ``text_only=True`` and encoded in
    ``'tfidf'`` mode. The model is trained with outputs written under
    ``tmp/outputs_test/tfidf_text_only`` and the reported ``val_metric`` is
    expected to be ``0.0`` (default ``np.isclose`` tolerances).
    """
    df_train, df_dev, df_test, metadata = get_fake_dataset(
        with_text_col=True, text_only=True)

    text_config = Mapping()
    text_config.mode = 'tfidf'
    text_config.max_words = 20

    encoder = Encoder(metadata, text_config=text_config)
    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    # The dev/test splits are still pushed through the fitted encoder, but
    # their outputs are not consumed by this test.
    encoder.transform(df_dev)
    encoder.transform(df_test)

    model_config = get_fake_modelconfig('tmp/outputs_test')
    model_config.output_dir = os.path.join(
        model_config.output_dir, 'tfidf_text_only')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(model_config.output_dir, exist_ok=True)

    model = NeuralNetworkModel(text_config, model_config)
    # NOTE(review): the training split is passed a second time in the
    # validation slots (the dev split was disabled here) -- presumably to
    # keep the expected metric stable on the tiny fake dataset; confirm.
    output = model.train(y_train, X_train_struc, X_train_text,
                         y_train, X_train_struc, X_train_text)

    val_metric_true = 0.0
    self.assertTrue(np.isclose(val_metric_true, output['val_metric']))
def get_fake_linear_regression_modelconfig(output_path):
    """Return a ``Mapping`` configured for the linear-regression test model.

    Args:
        output_path: value stored as ``output_dir`` on the configuration.

    Returns:
        A new ``Mapping`` with ``task_type``, ``num_classes``,
        ``model_type`` and ``output_dir`` set.
    """
    settings = (
        ('task_type', 'regression'),  # 'classification' or 'regression'
        ('num_classes', 3),  # number of classes or number of outputs
        ('model_type', 'linear_regression'),
        ('output_dir', output_path),
    )
    # ``C`` is deliberately not set here (a former ``C = 0.1`` assignment
    # is disabled).
    model_config = Mapping()
    for field, value in settings:
        setattr(model_config, field, value)
    return model_config
def get_fake_rf_modelconfig(output_path):
    """Return a ``Mapping`` configured for the random-forest test model.

    Args:
        output_path: value stored as ``output_dir`` on the configuration.

    Returns:
        A new ``Mapping`` with ``task_type``, ``num_classes``,
        ``model_type``, ``output_dir`` and ``n_trees`` set.
    """
    settings = (
        ('task_type', 'classification'),  # 'classification' or 'regression'
        ('num_classes', 3),  # number of classes or number of outputs
        ('model_type', 'random_forest'),
        ('output_dir', output_path),
        ('n_trees', 4),
    )
    model_config = Mapping()
    for field, value in settings:
        setattr(model_config, field, value)
    return model_config
def test_word_embedding(self):
    """Check the encoder output for GloVe-mode text on the fake dataset.

    The encoder is fitted on the training split and applied to the dev and
    test splits; the text and structured outputs of every split are then
    compared element-wise (``np.isclose``) against hard-coded expectations.

    Requires the pre-trained GloVe vectors at ``glove_file_path``.
    """
    # need be changed to where you store the pre-trained GloVe file.
    glove_file_path = 'resource/glove/glove.6B.50d.txt'
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=True)

    text_config = Mapping()
    text_config.mode = 'glove'
    text_config.max_words = 20
    text_config.maxlen = 5
    text_config.embedding_dim = 50
    text_config.embeddings_index = open_glove(glove_file_path)

    encoder = Encoder(metadata, text_config=text_config)
    _, train_struc, train_text = encoder.fit_transform(df_train)
    _, dev_struc, dev_text = encoder.transform(df_dev)
    _, test_struc, test_text = encoder.transform(df_test)

    # Expected text-column outputs, one row per sample.
    train_text_expected = np.array([
        [9, 10, 11, 2, 3],
        [15, 16, 17, 18, 19],
        [1, 2, 1, 1, 3],
    ])
    dev_text_expected = np.array([
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
    ])
    test_text_expected = np.array([
        [14, 4, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
    ])

    # Expected structured-column outputs, one row per sample.
    train_struc_expected = np.array([
        [-1.22474487, 1., 0., 0.],
        [0., 0., 1., 0.],
        [1.22474487, 0., 0., 1.],
    ])
    dev_struc_expected = np.array([
        [2.44948974, 0., 1., 0.],
        [6.12372436, 0., 1., 0.],
        [3.67423461, 0., 0., 1.],
    ])
    test_struc_expected = np.array([
        [0., 1., 0., 0.],
        [3.67423461, 0., 0., 1.],
        [1.22474487, 0., 0., 1.],
    ])

    # Checked in the order train, dev, test -- text before structured.
    comparisons = (
        (train_text_expected, train_text),
        (train_struc_expected, train_struc),
        (dev_text_expected, dev_text),
        (dev_struc_expected, dev_struc),
        (test_text_expected, test_text),
        (test_struc_expected, test_struc),
    )
    for expected, actual in comparisons:
        self.assertTrue(np.isclose(expected, actual).all())
def get_fake_modelconfig(output_path):
    """Return the ``Mapping`` of model settings used by the network tests.

    Args:
        output_path: value stored as ``output_dir`` on the configuration.

    Returns:
        A new ``Mapping`` carrying the task, architecture, optimizer and
        training-loop settings listed below.
    """
    settings = (
        ('task_type', 'classification'),  # 'classification' or 'regression'
        ('num_classes', 3),  # number of classes or number of outputs
        ('combine', 'concate'),  # or 'attention'
        ('model_type', 'mlp'),  # default is 'mlp', can be 'skip_connections'
        ('n_layers_dense', 2),
        ('hidden_size_dense', 16),
        ('n_layers_lstm', 2),
        ('hidden_size_lstm', 32),
        ('dropout_rate_lstm', 0.0),
        ('n_layers_output', 2),
        ('hidden_size_output', 32),
        ('optimizer', 'adam'),  # 'adam', 'sgd', 'rmsprop'
        ('learning_rate', 0.01),
        ('clipnorm', 5.0),
        ('patience', 5),
        ('output_dir', output_path),
        ('n_epochs', 10),
        ('batch_size', 1),
        ('verbose', 0),
    )
    model_config = Mapping()
    for field, value in settings:
        setattr(model_config, field, value)
    return model_config
def test_tfidf(self):
    """Check the encoder output for TF-IDF-mode text on the fake dataset.

    The encoder is fitted on the training split and applied to the dev and
    test splits; the text and structured outputs of every split are then
    compared element-wise (``np.isclose``) against hard-coded expectations.
    """
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=True)

    text_config = Mapping()
    text_config.mode = 'tfidf'
    text_config.max_words = 20
    print('*' * 20)
    print(text_config.mode)

    encoder = Encoder(metadata, text_config=text_config)
    _, train_struc, train_text = encoder.fit_transform(df_train)
    _, dev_struc, dev_text = encoder.transform(df_dev)
    _, test_struc, test_text = encoder.transform(df_test)

    # Expected text outputs: one row per sample, 20 columns (max_words),
    # written as two lines of ten values per row.
    train_text_expected = np.array([
        [0., 0.69314718, 0.69314718, 0., 0.91629073,
         0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
         0.91629073, 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1.55141507, 0., 0., 0., 0., 0., 0.,
         0., 0.91629073, 0.91629073, 0.91629073, 0.91629073,
         0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.],
        [0., 0.69314718, 0.69314718, 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.91629073],
    ])
    dev_text_expected = np.array([
        [0., 0., 0., 0.91629073, 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.91629073, 0.91629073,
         0.91629073, 0., 0., 0., 0., 0., 0., 0., 0., 0.],
    ])
    test_text_expected = np.array([
        [0., 0., 0., 1.55141507, 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.91629073, 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0.91629073, 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
    ])

    # Expected structured-column outputs, one row per sample.
    train_struc_expected = np.array([
        [-1.22474487, 1., 0., 0.],
        [0., 0., 1., 0.],
        [1.22474487, 0., 0., 1.],
    ])
    dev_struc_expected = np.array([
        [2.44948974, 0., 1., 0.],
        [6.12372436, 0., 1., 0.],
        [3.67423461, 0., 0., 1.],
    ])
    test_struc_expected = np.array([
        [0., 1., 0., 0.],
        [3.67423461, 0., 0., 1.],
        [1.22474487, 0., 0., 1.],
    ])

    # Checked in the order train, dev, test -- text before structured.
    comparisons = (
        (train_text_expected, train_text),
        (train_struc_expected, train_struc),
        (dev_text_expected, dev_text),
        (dev_struc_expected, dev_struc),
        (test_text_expected, test_text),
        (test_struc_expected, test_struc),
    )
    for expected, actual in comparisons:
        self.assertTrue(np.isclose(expected, actual).all())
def __init__(self, text_config, model_config):
    """Store the text and model configurations on the instance.

    Each argument is passed through ``Mapping(...)`` and the result is
    kept as ``self.text_config`` / ``self.model_config`` respectively.
    """
    # Text config is wrapped and stored first, then the model config.
    for attr_name, raw_config in (('text_config', text_config),
                                  ('model_config', model_config)):
        setattr(self, attr_name, Mapping(raw_config))