def main(in_dataset): # Hough and Schlangen 2015 config disf = DeepDisfluencyTagger( config_file= "deep_disfluency/deep_disfluency/experiments/experiment_configs.csv", config_number=21, saved_model_dir= "deep_disfluency/deep_disfluency/experiments/021/epoch_40") for idx, row in in_dataset.iterrows(): print ' '.join(row['utterance']) tagger_input = [(word, pos, None) for word, pos in zip(row['utterance'], row['pos'])] import pdb pdb.set_trace() print ' '.join(disf.tag_utterance(tagger_input))
def tag_words(txt_file):
    """Incrementally tag every word of *txt_file* and return the tagger's tags.

    The file is split into sentences on '.' and then into whitespace-separated
    words; each word is fed to the tagger one at a time.

    Fix: ``tags`` was only assigned inside the innermost loop, so an empty
    input file raised ``UnboundLocalError`` at the return. Reading
    ``disf.output_tags`` once, after all words have been fed, is equivalent
    for non-empty input and well-defined for empty input.
    """
    disf = DeepDisfluencyTagger(
        config_file="../deep_disfluency/experiments/experiment_configs.csv",
        config_number=21,
        saved_model_dir="../deep_disfluency/experiments/021/epoch_40")
    with open(txt_file, "r") as f:
        for lines in f:
            line = lines.split(".")
            for words in line:
                word = words.split()
                for i in word:
                    disf.tag_new_word(i)
    # NOTE(review): output_tags is read as an attribute here, matching the
    # original code -- confirm it is not a method on DeepDisfluencyTagger.
    tags = disf.output_tags
    return tags
def main(in_dataset, in_mode): # Hough and Schlangen 2015 config disf = DeepDisfluencyTagger( config_file= "deep_disfluency/deep_disfluency/experiments/experiment_configs.csv", config_number=21, saved_model_dir= "deep_disfluency/deep_disfluency/experiments/021/epoch_40") for key, value in eval_babi(disf, in_dataset).iteritems(): print '{}:\t{:.3f}'.format(key, value)
from deep_disfluency.tagger.deep_tagger import DeepDisfluencyTagger # Initialize the tagger from the config file with a config number # and saved model directory MESSAGE = """1. Disfluency tagging on pre-segmented utterances tags repair structure incrementally and other edit terms <e/> (Hough and Schlangen Interspeech 2015 with an RNN) """ print MESSAGE disf = DeepDisfluencyTagger(config_file="experiments/experiment_configs.csv", config_number=21, saved_model_dir="experiments/021/epoch_40") # Tag each word incrementally # Notice the incremental diff # Set diff_only to False if you want the whole utterance's tag each time with_pos = False print "tagging..." if with_pos: # if POS is provided use this: print disf.tag_new_word("john", pos="NNP") print disf.tag_new_word("likes", pos="VBP") print disf.tag_new_word("uh", pos="UH") print disf.tag_new_word("loves", pos="VBP") print disf.tag_new_word("mary", pos="NNP") else: # else the internal POS tagger tags the words incrementally print disf.tag_new_word("john") print disf.tag_new_word("likes") print disf.tag_new_word("uh") print disf.tag_new_word("loves")
# NOTE(review): this chunk begins mid-expression inside a GUI update handler
# whose definition starts outside the visible source (iterating word_graph /
# output tags in lockstep, flagging edit/repair tags "<e"/"<rps" and updating
# a Tk label counter), then wires an IBMWatsonASR handler and, under
# __main__, builds a DeepDisfluencyTagger (config 35, timing data) and a Tk
# DisfluencyGUI. Left byte-identical: the enclosing definition is not fully
# visible, so a safe rewrite is not possible from here. The bare "except:"
# that prints "FAILED TO UPDATE" swallows all errors -- worth narrowing once
# the full handler is in view.
word_graph[ new_output_start:], self.disf.get_output_tags(with_words=False)[ new_output_start:], range( new_output_start, len(word_graph)) ): print w, h, i if ("<e" in h or "<rps" in h) and \ i not in self.words_with_disfluency: self.label.config(text=str(self.counter.next())) self.master.update() self.words_with_disfluency.append(i) except: print "FAILED TO UPDATE" self.master.update() self.asr = IBMWatsonASR("credentials.json", new_word_hypotheses_handler) self.asr.listen() if __name__ == '__main__': disf = DeepDisfluencyTagger( config_file="deep_disfluency/experiments/experiment_configs.csv", config_number=35, saved_model_dir="deep_disfluency/experiments/035/epoch_6", use_timing_data=True ) root = Tk() my_gui = DisfluencyGUI(root, disf)
# NOTE(review): training section of an experiment script. When train_models
# is set, each numbered experiment config is trained to convergence via
# disf.train_net on pre-extracted feature matrices, recording the best epoch
# per experiment in systems_best_epoch. The trailing "else:" branch is cut
# off at this chunk's edge (only its explanatory comments about experiments
# 33-38 are visible here), so the code is left byte-identical rather than
# reconstructed.
print 'Finished extracting features.' # 4. Train the model on the transcripts (and audio data if available) # NB each of these experiments can take up to 24 hours systems_best_epoch = {} if train_models: feature_matrices_filepath = THIS_DIR + '/../data/disfluency_detection/' + \ 'feature_matrices/train' validation_filepath = THIS_DIR + '/../data/disfluency_detection/' + \ 'feature_matrices/heldout' # train until convergence # on the settings according to the numbered experiments in # experiments/config.csv file for exp in experiments: disf = DeepDisfluencyTagger(config_file=THIS_DIR + "/experiment_configs.csv", config_number=exp) exp_str = '%03d' % exp e = disf.train_net(train_dialogues_filepath=feature_matrices_filepath, validation_dialogues_filepath=validation_filepath, model_dir=THIS_DIR + '/' + exp_str, tag_accuracy_file_path=THIS_DIR + '/results/tag_accuracies/{}.text'.format(exp_str)) systems_best_epoch[exp] = e else: # 33 RNN simple tags, disf + utt joint # 34 RNN complex tags, disf + utt joint # 35 LSTM simple tags, disf + utt joint # 36 LSTM complex tags, disf + utt joint # 37 LSTM simple tags, disf only # 38 LSTM simple tags, utt only
# (38, 8, 'LSTM (TTO only)'), # (39, 2, 'LSTM (complex tags)') # 4. Test the models on the test transcripts according to the best epochs # from training. # The output from the models is made in the folders # For now all use timing data if test_models: print "testing models..." for exp, best_epoch in sorted(systems_best_epoch.items(), lambda x: x[0]): exp_str = '%03d' % exp # load the model disf = DeepDisfluencyTagger( config_file=THIS_DIR + '/experiment_configs.csv', config_number=exp, saved_model_dir=THIS_DIR + '/{0}/epoch_{1}'.format(exp_str, best_epoch)) # simulating (or using real) ASR results # for now just saving these in the same folder as the best epoch # also outputs the speed partial_string = '_partial' if partial else '' disf.incremental_output_from_file( THIS_DIR + '/../data/disfluency_detection/switchboard/' + 'swbd_disf_heldout{}_data_timings.csv'.format(partial_string), target_file_path=THIS_DIR + '/{0}/epoch_{1}/'.format(exp_str, best_epoch) + 'heldout_output_increco.text') # 5. To get the numbers run the notebook: # experiments/analysis/EACL_2017/EACL_2017.ipynb