import unittest

import numpy as np

# Project-specific helpers (CsvDataIterator, SEPERATOR, PAD_WORD, PAD_CHAR_ID,
# print_info) are assumed to be importable from the package under test.


class CsvDataIteratorTest(unittest.TestCase):
    def setUp(self):
        print_info("==============")
        self.data_iterator = CsvDataIterator("conll_csv_experiments/", 32)

    def test_pad_sequences_level_one(self):
        seq = [["word1"], ["word1", "w2"], ["word1", "w2", "wd3"]]
        seq = ["{}".format(SEPERATOR).join(words) for words in seq]

        sequence_padded, sequence_length = self.data_iterator._pad_sequences(
            sequences=seq,
            pad_tok="{}{}".format(SEPERATOR, PAD_WORD),
            nlevels=1)

        np.testing.assert_array_equal(sequence_length, np.array([3, 3, 3]))
        self.assertEqual(sequence_padded[0],
                         'word1' + SEPERATOR + PAD_WORD + SEPERATOR + PAD_WORD)
        self.assertEqual(sequence_padded[1], 'word1~w2' + SEPERATOR + PAD_WORD)
        self.assertEqual(sequence_padded[2], 'word1~w2~wd3')

    def test_pad_sequences_level_two(self):
        char_2_id_map = {
            "w": 0,
            "o": 1,
            "r": 2,
            "d": 3,
            "1": 4,
            "2": 5,
            "3": 6
        }
        sequences = [["word1"], ["word1", "w2"], ["word1", "w2", "wd3"]]
        char_ids = []
        for seq in sequences:
            ids = [[char_2_id_map.get(c, 0) for c in str(word)] for word in seq]
            char_ids.append(ids)

        sequence_padded, sequence_length = self.data_iterator._pad_sequences(
            sequences=char_ids,
            pad_tok=int(PAD_CHAR_ID),
            nlevels=2,
            MAX_WORD_LENGTH=5)

        np.testing.assert_array_equal(
            sequence_length, np.array([[5, 0, 0], [5, 2, 0], [5, 2, 3]]))
        expected = np.array(
            [[[0, 1, 2, 3, 4], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
             [[0, 1, 2, 3, 4], [0, 5, 0, 0, 0], [0, 0, 0, 0, 0]],
             [[0, 1, 2, 3, 4], [0, 5, 0, 0, 0], [0, 3, 6, 0, 0]]])
        np.testing.assert_array_equal(sequence_padded, expected)
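# Minimal sketch (assumption: the test class above lives in its own module,
# e.g. csv_data_iterator_test.py) showing how these padding tests would
# typically be run; standard unittest discovery works just as well.

if __name__ == "__main__":
    unittest.main()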
def load_estimator(self):
    estimator_config, estimator = TFEstimatorFactory.get("bilstm_crf_v0")
    if self.model_dir:
        config = estimator_config.load(self.model_dir)
        if config is None:
            # Fail safe: fall back to user hyperparameters
            estimator_config = estimator_config.with_user_hyperparamaters(
                EXPERIMENT_ROOT_DIR, self.preprocessor.OUT_DIR)
        else:
            estimator_config = config
    else:
        estimator_config = estimator_config.with_user_hyperparamaters(
            EXPERIMENT_ROOT_DIR, self.preprocessor.OUT_DIR)

    self.estimator = estimator(estimator_config)
    self.data_iterators = CsvDataIterator(self.preprocessor.OUT_DIR,
                                          batch_size=BATCH_SIZE)
class CsvTagger():
    def __init__(self, model_name="bilstm_crf_v0", model_dir=None):
        self.preprocessor = None
        self.estimator = None
        self.data_iterators = None
        self.model_dir = model_dir
        self.model_name = model_name

        # Fail safe: fall back to the default model name
        if self.model_name is None:
            self.model_name = "bilstm_crf_v0"

        self.preprocessor = CoNLLDataPreprocessor(
            experiment_root_directory=EXPERIMENT_ROOT_DIR,
            over_write=None,
            use_iob=None,
            out_dir=None,
            train_df_path=None,
            val_df_path=None,
            test_df_path=None,
            text_col=None,
            entity_col=None,
            do_run_time_config=False)

    def load_estimator(self):
        estimator_config, estimator = TFEstimatorFactory.get(self.model_name)
        if self.model_dir:
            config = estimator_config.load(self.model_dir)
            if config is None:
                # Fail safe: fall back to user hyperparameters
                estimator_config = estimator_config.with_user_hyperparamaters(
                    EXPERIMENT_ROOT_DIR,
                    self.preprocessor.PREPROCESSED_DATA_DIR)
            else:
                estimator_config = config
        else:
            estimator_config = estimator_config.with_user_hyperparamaters(
                EXPERIMENT_ROOT_DIR, self.preprocessor.PREPROCESSED_DATA_DIR)

        self.estimator = estimator(estimator_config)
        self.data_iterators = CsvDataIterator(
            self.preprocessor.PREPROCESSED_DATA_DIR, batch_size=BATCH_SIZE)

    def preprocess(self):
        self.preprocessor.start()

    def train(self, batch_size, num_epochs):
        self.load_estimator()

        if self.estimator.FEATURE_NAME != self.data_iterators.FEATURE_NAME:
            print_error(
                "The given DataIterator cannot be used with the chosen model. "
                "Try other models!")
            exit(1)

        self.data_iterators.prepare()

        num_samples = self.data_iterators.NUM_TRAINING_SAMPLES
        print_info(num_samples)
        max_steps = (num_samples // batch_size) * num_epochs
        print_info("Total number of steps: {}".format(max_steps))

        for current_epoch in range(num_epochs):
            max_steps = (num_samples // batch_size) * (current_epoch + 1)

            train_hooks = []
            train_hooks.append(self.data_iterators.train_data_init_hook)
            # if len(estimator.hooks) > 0:
            #     train_hooks.extend(tagger.hooks)

            self.estimator.train(
                input_fn=self.data_iterators.train_data_input_fn,
                hooks=train_hooks,
                max_steps=max_steps)

            eval_results = self.estimator.evaluate(
                input_fn=self.data_iterators.val_data_input_fn,
                hooks=[self.data_iterators.val_data_init_hook
                       ])  # tf_debug.LocalCLIDebugHook()

            print(eval_results)

    def predict_on_test_files(self, csv_files_path="data/test/"):
        # TODO handle the estimator behaviour in train, retrain and predict mode
        # Load the estimator and data iterator before checking compatibility;
        # both are None until load_estimator() has run.
        self.load_estimator()
        if self.estimator.FEATURE_NAME == self.data_iterators.FEATURE_NAME:
            self.data_iterators.predict_on_csv_files(
                estimator=self.estimator, csv_files_path=csv_files_path)
        else:
            print_error(
                "The given DataIterator cannot be used with the chosen model. "
                "Try other models!")
            exit(1)
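# Minimal end-to-end usage sketch, not part of the original module: it assumes
# the CoNLL CSV data expected by CoNLLDataPreprocessor is already in place
# under EXPERIMENT_ROOT_DIR and that BATCH_SIZE is the module-level constant
# used above. The variable name `tagger` and num_epochs=5 are illustrative only.

if __name__ == "__main__":
    tagger = CsvTagger(model_name="bilstm_crf_v0")
    tagger.preprocess()                                 # convert the raw CoNLL CSVs
    tagger.train(batch_size=BATCH_SIZE, num_epochs=5)   # loads the estimator internally
    tagger.predict_on_test_files(csv_files_path="data/test/")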