def populate_segs(db, trs_folder):
    """Insert one 'trs_segs' row per parsed item found under ``trs_folder``.

    Every file matching ``<trs_folder>*.trs`` or ``<trs_folder>*.its`` is
    parsed. TRS files contribute the objects returned by ``TRSParser.parse``;
    for ITS files the ``utters`` of every parsed segment are stored instead.

    Args:
        db: object providing ``insert(table_name, column_names, rows)``.
        trs_folder: prefix that is concatenated directly with the glob
            patterns (so it is expected to end with a path separator).
    """
    print('Populating db using trs files...')

    filenames = glob.glob('%s*.trs' % (trs_folder))
    filenames += glob.glob('%s*.its' % (trs_folder))
    total = len(filenames)

    for file_num, filename in enumerate(filenames, 1):
        print('File %d of %d' % (file_num, total))

        is_trs = filename.endswith('.trs')
        if is_trs:
            items = TRSParser(filename).parse(validate=False)
        else:
            # workaround for .its files: flatten the segments into utterances
            items = []
            for its_seg in ITSParser(filename).parse():
                items.extend(its_seg.utters)

        # file code = upper-cased base name without the 4-char extension
        file_cd = os.path.basename(filename[:-4]).upper()

        for item in items:
            if is_trs:
                # TRS items expose a list of speakers; warn when a FUZ
                # speaker shares the segment with somebody else
                has_fuz = any(
                    spk.speaker_codeinfo.code == 'FUZ'
                    for spk in item.speakers)
                if has_fuz and len(item.speakers) > 1:
                    print('Warning: Found multi-speaker FUZ seg in file: "%s"'
                          % (os.path.basename(filename)))
                speaker_code = item.speakers[0].speaker_codeinfo.code
            else:
                # ITS utterances expose a single speaker
                speaker_code = item.speaker.speaker_codeinfo.code

            db.insert(
                'trs_segs',
                'file_cd start end speaker'.split(),
                ((file_cd, item.start, item.end, speaker_code), ))

    print('done.\n')
def process_dir(path, env, par_code, writer):
    """Count child vocalizations for every TRS file under ``path``.

    For each ``<path>*.trs`` file the transcribed segments are obtained via
    ``get_trans_segs``; the results of ``get_trans_child_vocs`` and
    ``get_lena_child_vocs`` (the latter restricted to the time range spanned
    by those segments) are summed over all files and written as one row.

    Args:
        path: prefix concatenated directly with ``*.trs`` for globbing.
        env: environment name; selects the ``<lena_db_path><env>.db`` file.
        par_code: value written in the first column of the output row.
        writer: object providing ``writerow(list)``.
    """
    trs_filenames = glob.glob('%s*.trs' % (path))

    trans_count = 0
    lena_count = 0

    lena_db = Database('%s%s.db' % (lena_db_path, env))
    try:
        for filename in trs_filenames:
            print('\n\tProcessing file %s' % (os.path.basename(filename)))

            segs = get_trans_segs(TRSParser(filename).parse())
            if not segs:
                continue

            # the examined zone runs from the first to the last segment
            zone_start = segs[0].start
            zone_end = segs[-1].end
            print('\tExamining range: %s (%0.2f) - %s (%0.2f)' %
                  (get_time_str(zone_start), zone_start,
                   get_time_str(zone_end), zone_end))

            trans_count += get_trans_child_vocs(segs)
            lena_count += get_lena_child_vocs(lena_db, filename, zone_start,
                                              zone_end)
    finally:
        # release the database handle even if parsing/counting fails
        lena_db.close()

    writer.writerow([par_code, trans_count, lena_count])
def process_dir(full_item_path, output_dir):
    """Build one matrix CSV for all TRS files under ``full_item_path``.

    Every ``<full_item_path>*.trs`` file is parsed, its segments are divided
    by ``StateMachine.divide_segs`` into single / numbered-multi /
    unnumbered-multi groups, and each group is fed into a shared matrix.
    The matrix and the group counts are then passed to ``output_matrix``.

    The counts start at zero and are summed over all files, so they cover
    the same data as the matrix and are always defined, even when no file
    yields any segments.

    Args:
        full_item_path: directory prefix (expected to end with '/'); its
            second-to-last '/'-separated component names the output file.
        output_dir: prefix for the ``<name>-matrix.csv`` output path.
    """
    trs_filenames = glob.glob('%s*.trs' % (full_item_path))
    matrix = build_matrix(TRANS_CODES, LENA_CODES)

    count_single = 0
    count_numbered_multi = 0
    count_unnumbered_multi = 0
    count_angle_brackets = 0

    for filename in trs_filenames:
        print('\n\tProcessing file %s' % (os.path.basename(filename)))

        segs = get_trans_segs(TRSParser(filename).parse())
        if not segs:
            continue

        print('\tExamining range: %s - %s' %
              (get_time_str(segs[0].start), get_time_str(segs[-1].end)))

        sm = StateMachine()
        single, numbered_multi, unnumbered_multi = sm.divide_segs(
            segs, use_lena_segmentation=False)

        count_single += len(single)
        count_numbered_multi += len(numbered_multi)
        count_unnumbered_multi += len(unnumbered_multi)
        count_angle_brackets += (count_angle_bracket_segs(single) +
                                 count_angle_bracket_segs(numbered_multi) +
                                 count_angle_bracket_segs(unnumbered_multi))

        process_single(single, matrix)
        process_numbered_multi(numbered_multi, matrix)
        process_unnumbered_multi(unnumbered_multi, matrix)

    output_name = '%s%s-matrix.csv' % (output_dir,
                                       full_item_path.split('/')[-2])
    output_matrix(matrix, output_name, count_single, count_numbered_multi,
                  count_unnumbered_multi, count_angle_brackets)
def run():
    """Produce one matrix CSV per sub-directory of every environment.

    For each name in ``envs`` the directory ``<input_path><env>/`` is listed;
    every sub-directory is scanned for ``*.trs`` files whose segments are
    divided (``divide_segs``) and accumulated into a matrix, which is written
    to ``<output_path><env>/<sub-directory>-matrix.csv``.

    The per-directory counts start at zero and are summed over all files of
    the directory, so they cover the same data as the matrix and are always
    defined, even when no file yields any segments.
    """
    LOGFILE = 'logs/confusion.log'

    #create log file if it doesn't exist
    check_log_file(LOGFILE)

    #set up logging; the format prefixes each message with a timestamp
    logging.basicConfig(level=logging.ERROR,
                        filename=LOGFILE,
                        format='%(asctime)s %(message)s')

    for cur_env in envs:
        print('Processing environment: %s' % (cur_env))

        output_dir = '%s%s/' % (output_path, cur_env)
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)

        input_dir = '%s%s/' % (input_path, cur_env)
        for item in os.listdir(input_dir):
            full_item_path = '%s%s/' % (input_dir, item)
            if not os.path.isdir(full_item_path):
                continue

            trs_filenames = glob.glob('%s*.trs' % (full_item_path))
            matrix = build_matrix(TRANS_CODES, LENA_CODES)

            count_single = 0
            count_numbered_multi = 0
            count_unnumbered_multi = 0
            count_angle_brackets = 0

            for filename in trs_filenames:
                print('\n\tProcessing file %s' % (os.path.basename(filename)))

                segs = get_trans_segs(TRSParser(filename).parse())
                if not segs:
                    continue

                print('\tExamining range: %s - %s' % (get_time_str(
                    segs[0].start), get_time_str(segs[-1].end)))

                single, numbered_multi, unnumbered_multi = divide_segs(segs)

                count_single += len(single)
                count_numbered_multi += len(numbered_multi)
                count_unnumbered_multi += len(unnumbered_multi)
                count_angle_brackets += (
                    count_angle_bracket_segs(single) +
                    count_angle_bracket_segs(numbered_multi) +
                    count_angle_bracket_segs(unnumbered_multi))

                process_single(single, matrix)
                process_numbered_multi(numbered_multi, matrix)
                process_unnumbered_multi(unnumbered_multi, matrix)

            output_name = '%s%s-matrix.csv' % (output_dir,
                                               full_item_path.split('/')[-2])
            output_matrix(matrix, output_name, count_single,
                          count_numbered_multi, count_unnumbered_multi,
                          count_angle_brackets)
class VerificationWindow():
    """Window showing the segments, utterances and parser errors of a TRS file.

    The data is held in a three-column tree model (id, description, error
    state) with segments at the top level, their utterances below them and
    one row per parser error/warning below each utterance. A toolbar offers
    error filtering, expand/collapse, rescanning, playback and closing.
    """

    # Values stored in model column 2. They are compared with '<' and '>'
    # below, so a later name is expected to compare greater than an earlier
    # one -- NOTE(review): confirm against the project's Enum class.
    ERROR_STATES = Enum(['NONE', 'WARNING', 'ERROR'])

    def __init__(self, filename, progress_dialog):
        """Parse ``filename`` and build/show the window.

        Args:
            filename: path handed to ``TRSParser``.
            progress_dialog: object providing ``set_fraction`` and
                ``next_phase``; used to report parsing and UI-building
                progress.
        """
        self.logger = logging.getLogger(__name__)

        self.window = gtk.Window(gtk.WindowType.TOPLEVEL)
        self.window.set_title('Transcription Verifier')
        self.window.connect('destroy', lambda x: self.window.destroy())
        self.window.set_border_width(10)
        self.window.set_default_size(580, 500)

        self.trs_parser = TRSParser(filename)
        self.trs_parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            progress_next_phase_fcn=progress_dialog.next_phase,
            remove_bad_trans_codes=False)
        self.wav_parser = None  # created lazily by play_selected_seg()
        progress_dialog.next_phase()

        self.filter_errors = True
        self.toolbar = self.build_toolbar()
        self.treeview = self.build_treeview(progress_dialog.set_fraction)
        self.treeview.expand_all()

        scrolled_win = gtk.ScrolledWindow()
        scrolled_win.set_policy(gtk.PolicyType.AUTOMATIC,
                                gtk.PolicyType.AUTOMATIC)
        scrolled_win.add(self.treeview)

        vbox = gtk.VBox(False, 2)
        vbox.pack_start(self.toolbar, False, False, 0)
        vbox.pack_start(scrolled_win, True, True, 0)
        self.window.add(vbox)

        self.window.show_all()

    def _make_tool_button(self, label, icon, on_click):
        """Return a ToolButton with the given label, icon and 'clicked' handler."""
        button = gtk.ToolButton()
        image = gtk.Image()
        image.set_from_file(UIUtils.get_icon_path(icon))
        button.set_label(label)
        button.set_icon_widget(image)
        button.connect('clicked', on_click)
        return button

    def build_toolbar(self):
        """Create and return the horizontal toolbar with all its buttons."""
        toolbar = gtk.Toolbar()
        toolbar.set_orientation(gtk.Orientation.HORIZONTAL)

        filter_errors_button = gtk.ToggleToolButton()
        #set this before connecting the toggled handler so it doesn't cause trouble
        filter_errors_button.set_active(True)
        filter_errors_button.connect(
            'toggled', lambda w: self.toggle_filter_errors(w.get_active()))
        filter_errors_icon = gtk.Image()
        filter_errors_icon.set_from_file(
            UIUtils.get_icon_path(UIUtils.BUTTON_ICONS.FLAG))
        filter_errors_button.set_label('Show Errors Only')
        filter_errors_button.set_icon_widget(filter_errors_icon)

        # the lambdas look up self.treeview lazily: it does not exist yet
        # while the toolbar is being built
        expand_button = self._make_tool_button(
            'Expand All', UIUtils.BUTTON_ICONS.EXPAND,
            lambda w: self.treeview.expand_all())
        collapse_button = self._make_tool_button(
            'Collapse All', UIUtils.BUTTON_ICONS.COLLAPSE,
            lambda w: self.treeview.collapse_all())
        rescan_button = self._make_tool_button(
            'Rescan File', UIUtils.BUTTON_ICONS.REFRESH,
            lambda w: self._rescan_file())
        play_seg_button = self._make_tool_button(
            'Play Seg', UIUtils.BUTTON_ICONS.PLAY,
            lambda w: self.play_selected_seg())
        close_button = self._make_tool_button(
            'Close', UIUtils.BUTTON_ICONS.CLOSE,
            lambda w: self.window.destroy())
        exit_button = self._make_tool_button(
            'Exit', UIUtils.BUTTON_ICONS.EXIT, lambda w: gtk.main_quit())

        toolbar.insert(filter_errors_button, -1)
        toolbar.insert(expand_button, -1)
        toolbar.insert(collapse_button, -1)
        toolbar.insert(gtk.SeparatorToolItem(), -1)
        toolbar.insert(play_seg_button, -1)
        toolbar.insert(rescan_button, -1)
        toolbar.insert(gtk.SeparatorToolItem(), -1)
        toolbar.insert(close_button, -1)
        toolbar.insert(exit_button, -1)

        return toolbar

    def _rescan_file(self):
        """Re-parse the TRS file and replace the treeview's model."""
        self.window.set_sensitive(False)
        try:
            progress_dialog = ProgressDialog(
                'Processing File...',
                ['Parsing trs file...', 'Validating data...', 'Building UI...'])
            progress_dialog.show()

            #this causes the parser to invalidate all cache, re-open and re-parse the file
            self.trs_parser.re_parse(
                progress_update_fcn=progress_dialog.set_fraction,
                progress_next_phase_fcn=progress_dialog.next_phase)

            #build a new treeview model based on the new data
            progress_dialog.next_phase()
            filter_model = self._build_tree_store(progress_dialog.set_fraction)
            self.treeview.set_model(filter_model)

            #Presumably the most common cause for rescanning is to check if errors have been fixed.
            #If the error filter is on, automatically expand all rows to show any remaining errors.
            if self.filter_errors:
                self.treeview.expand_all()
        finally:
            # never leave the window disabled, even if re-parsing fails
            self.window.set_sensitive(True)

    def _describe_utter(self, utter):
        """Return the description text shown for an utterance row."""
        #question mark indicates an error occurred - if we have utter.speaker, we should have a
        #speaker code. It stays '?' when utter.speaker is set but utter.speaker.speaker_codeinfo is not.
        speaker_cd = '?'
        if utter.speaker:
            if utter.speaker.speaker_codeinfo:
                speaker_cd = utter.speaker.speaker_codeinfo.get_code()
        else:
            speaker_cd = ' - '

        desc_str = '%s [%s - %s]' % (speaker_cd,
                                     BackendUtils.get_time_str(utter.start),
                                     BackendUtils.get_time_str(utter.end))
        if utter.lena_notes:
            desc_str += ' %s' % (utter.lena_notes)
        if utter.trans_phrase:
            desc_str += ' %s' % (utter.trans_phrase)
        if utter.lena_codes:
            desc_str += ' |%s|' % ('|'.join(utter.lena_codes))
        if utter.trans_codes:
            # open the '|' delimited list if the lena codes did not already
            if not utter.lena_codes:
                desc_str += ' |'
            desc_str += '%s|' % ('|'.join(utter.trans_codes))

        return desc_str

    def _build_tree_store(self, progress_update_fcn):
        """Build the tree model from the parser's data and return it filtered.

        Args:
            progress_update_fcn: called after every utterance with the
                fraction of utterances handled so far.

        Returns:
            A filter model over the tree store whose visibility function is
            ``self.filter``.
        """
        #segment/utter id, description, error_state
        tree_store = gtk.TreeStore(gobject.TYPE_INT, gobject.TYPE_STRING,
                                   gobject.TYPE_INT)
        states = VerificationWindow.ERROR_STATES

        cur_utter = 0
        for seg in self.trs_parser.parse():
            if seg.speakers:
                seg_speakers = ' + '.join(
                    [spk.speaker_codeinfo.get_code() for spk in seg.speakers])
            else:
                seg_speakers = ' - '

            seg_desc = '%s [%s - %s]' % (seg_speakers,
                                         BackendUtils.get_time_str(seg.start),
                                         BackendUtils.get_time_str(seg.end))
            seg_iter = tree_store.append(None,
                                         [seg.num, seg_desc, states.NONE])

            for utter in seg.utters:
                utter_iter = tree_store.append(
                    seg_iter,
                    [utter.id, self._describe_utter(utter), states.NONE])

                cur_utter += 1
                progress_update_fcn(
                    float(cur_utter) / float(self.trs_parser.total_utters))

                #note: these may be errors or warnings
                error_list = self.trs_parser.error_collector.get_errors_by_utter(
                    utter)
                for error in error_list:
                    if isinstance(error, ParserWarning):
                        error_type = states.WARNING
                    else:
                        error_type = states.ERROR
                    tree_store.append(utter_iter,
                                      [-1, '%s' % (error.msg), error_type])

                    # raise the state of the utterance row and of every
                    # ancestor to at least this error's state
                    parent_it = utter_iter
                    while parent_it:
                        if tree_store.get_value(parent_it, 2) < error_type:
                            tree_store.set_value(parent_it, 2, error_type)
                        parent_it = tree_store.iter_parent(parent_it)

        filter_model = tree_store.filter_new()
        filter_model.set_visible_func(self.filter)

        return filter_model

    def build_treeview(self, progress_update_fcn):
        """Create the TreeView; only the 'Description' column is visible."""
        filter_model = self._build_tree_store(progress_update_fcn)
        treeview = gtk.TreeView(filter_model)

        col = gtk.TreeViewColumn('ID', gtk.CellRendererText(), text=0)
        col.set_visible(False)
        treeview.append_column(col)

        renderer = gtk.CellRendererText()
        col = gtk.TreeViewColumn('Description', renderer, text=1)
        col.set_cell_data_func(renderer, self.cell_render_fcn)
        treeview.append_column(col)

        col = gtk.TreeViewColumn('Error State', gtk.CellRendererText(), text=2)
        col.set_visible(False)
        treeview.append_column(col)

        return treeview

    def cell_render_fcn(self, col, cell_renderer, model, it, user_data=None):
        """Colour a description cell by the row's error state.

        Warnings are orange, errors red, everything else black.
        """
        error_state = model.get_value(it, 2)
        if error_state == VerificationWindow.ERROR_STATES.WARNING:
            colour = 'orange'
        elif error_state == VerificationWindow.ERROR_STATES.ERROR:
            colour = 'red'
        else:
            colour = 'black'
        cell_renderer.set_property('foreground', colour)

        return

    #returns true if row pointed to by 'it' should be visible
    def filter(self, model, it, user_data):
        """Visibility function: with the error filter on, hide rows in state NONE."""
        if not self.filter_errors:
            return True
        return model.get_value(it, 2) > VerificationWindow.ERROR_STATES.NONE

    def toggle_filter_errors(self, filter_errors):
        """Switch the error filter on/off and refilter the tree.

        Args:
            filter_errors: the new filter state (the toggle button's active
                state). Using the passed state instead of blindly flipping
                the flag keeps the flag and the button from drifting apart.
        """
        self.filter_errors = bool(filter_errors)
        self.treeview.get_model().refilter()

    def play_selected_seg(self):
        """Play the segment that owns the selected row.

        The WAV file is asked for the first time it is needed. The
        "no selection" dialog is shown when nothing is selected, and also
        when no WAV parser is available after the prompt.
        """
        (model, it) = self.treeview.get_selection().get_selected()
        if not it:
            UIUtils.show_no_sel_dialog()
            return

        #if they've selected an utterance or error row, find the top level parent (the segment) and use it instead
        parent = model.iter_parent(it)
        while parent:
            it = parent
            parent = model.iter_parent(it)

        seg_num = model.get_value(it, 0)
        # NOTE(review): relies on the parsed list being indexable by seg.num
        seg = self.trs_parser.parse()[seg_num]

        if not self.wav_parser:
            dialog = gtk.FileChooserDialog(
                title='Select WAV File',
                action=gtk.FileChooserAction.OPEN,
                buttons=(gtk.STOCK_CANCEL, gtk.ResponseType.CANCEL,
                         gtk.STOCK_OPEN, gtk.ResponseType.OK))
            dialog.set_default_response(gtk.ResponseType.OK)

            for filter_name, pattern in (('wav Files', '*.wav'),
                                         ('All Files', '*')):
                file_filter = gtk.FileFilter()
                file_filter.set_name(filter_name)
                file_filter.add_pattern(pattern)
                dialog.add_filter(file_filter)

            response = dialog.run()
            if response == gtk.ResponseType.OK:
                filename = dialog.get_filename()
                self.wav_parser = WavParser(filename)

            dialog.destroy()

        if self.wav_parser:
            self.wav_parser.play_seg(seg)
        else:
            UIUtils.show_no_sel_dialog()
class FreqWindow():
    """Window listing the utterances kept by a ``WHQFilter`` with word counts.

    Each row shows time, phrase, speakers, target listeners and an editable
    'WHQ Count', followed by one column per entry of ``self.count_cols``.
    Every count column is a list ``[column name, regex, running total]``.
    Rows can be shown per utterance or per chain of linked utterances, and
    the result can be exported through ``FreqExporter``.
    """

    def __init__(self, filename, progress_dialog):
        """Parse ``filename``, filter its segments and build/show the window.

        Args:
            filename: path handed to ``TRSParser``.
            progress_dialog: object providing ``set_fraction``,
                ``next_phase`` and ``ensure_finish``.
        """
        self.window = gtk.Window(gtk.WindowType.TOPLEVEL)
        self.window.set_title('WH-Frequency Counter')
        self.window.set_border_width(10)
        self.window.set_default_size(730, 400)

        self.logger = logging.getLogger(__name__)

        self.count_cols = self._get_initial_count_cols()
        self.trs_parser = TRSParser(filename)
        segments = self.trs_parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            progress_next_phase_fcn=progress_dialog.next_phase,
            validate=False,
            seg_filters=[])

        #this object caches original segs and helps with lookup by segment number
        self.filter_manager = FilterManager(segments)

        calc = CountOutputCalc('', CountOutputCalc.COUNT_TYPES.PER_SEG, 1)
        #this object filters and allows us to retrieve the filtered segs
        self.output = Output('', '', [WHQFilter()], calc, False)
        # plain loop rather than map(): the calls are made for their side
        # effect, and a lazy map() would never run them
        for seg in segments:
            self.output.add_item(seg)

        treeview = self._build_treeview()

        #ensure progress dialog self-destructs even if no utterances are found (in that case the above call never invokes progress_dialog.set_fraction)
        progress_dialog.ensure_finish()

        scrolled_win = gtk.ScrolledWindow()
        # NOTE(review): the rest of this class uses namespaced enums such as
        # gtk.ResponseType.OK; confirm gtk.POLICY_AUTOMATIC exists in the
        # gtk module bound by this file.
        scrolled_win.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        scrolled_win.add(treeview)

        export_button = UIUtils.create_button('Export Results',
                                              UIUtils.BUTTON_ICONS.EXPORT)
        export_button.connect('clicked',
                              lambda widget: self._export_results(treeview))

        close_button = UIUtils.create_button('Close',
                                             UIUtils.BUTTON_ICONS.CLOSE)
        close_button.connect('clicked', lambda w: self.window.destroy())

        add_button = UIUtils.create_button('Add Count Column',
                                           UIUtils.BUTTON_ICONS.ADD)
        add_button.connect('clicked', lambda w: self._add_count_col(treeview))

        self.remove_button = UIUtils.create_button('Remove Count Column',
                                                   UIUtils.BUTTON_ICONS.REMOVE)
        self.remove_button.connect('clicked',
                                   lambda w: self._remove_count_col(treeview))
        self._update_remove_button_state()

        options_frame = gtk.Frame(label='Options')
        options_vbox = gtk.VBox()
        self.linked_checkbox = gtk.CheckButton('Group Linked Segments')
        self.linked_checkbox.connect('toggled', self._toggle_seg_grouping,
                                     treeview)
        options_vbox.pack_start(self.linked_checkbox, False, False, 0)

        self.context_checkbox = gtk.CheckButton('Show Context')
        self.context_checkbox.connect('toggled', self._toggle_show_context,
                                      treeview)
        options_vbox.pack_start(self.context_checkbox, False, False, 0)
        options_frame.add(options_vbox)

        self.statusbar = gtk.Statusbar()
        self.statusbar.set_has_resize_grip(False)
        # starts as the number of rows; every row starts with a WHQ count of 1
        self.num_whq = treeview.get_model().iter_n_children(None)
        self._update_statusbar()

        vbox = gtk.VBox()
        bbox = gtk.HButtonBox()
        bbox.pack_start(export_button, True, False, 0)
        bbox.pack_start(add_button, True, False, 0)
        bbox.pack_start(self.remove_button, True, False, 0)
        bbox.pack_start(close_button, True, False, 0)

        vbox.pack_start(scrolled_win, True, True, 0)
        vbox.pack_start(self.statusbar, False, False, 0)
        vbox.pack_end(bbox, False, False, 0)
        vbox.pack_end(options_frame, False, False, 0)
        self.window.add(vbox)

        self.window.show_all()

    def _get_initial_count_cols(self):
        """Return the default count columns as ``[name, regex, total]`` lists."""
        return [[word.capitalize(), word, 0]
                for word in 'who what why when where how'.split()]

    def _toggle_show_context(self, checkbox, treeview):
        """Rebuild the model after 'Show Context' changed, keeping the WHQ counts."""
        tree_model = self._build_list_store(
            link_segs=self.linked_checkbox.get_active(),
            prev_store=treeview.get_model(),
            show_context=self.context_checkbox.get_active())
        treeview.set_model(tree_model)

    def _toggle_seg_grouping(self, checkbox, treeview):
        """Rebuild the model after 'Group Linked Segments' changed.

        The rows change, so the previous WHQ counts are not carried over and
        the WHQ total is reset to the new number of rows.
        """
        tree_model = self._build_list_store(
            link_segs=self.linked_checkbox.get_active(),
            prev_store=None,
            show_context=self.context_checkbox.get_active())
        treeview.set_model(tree_model)

        self.num_whq = treeview.get_model().iter_n_children(None)
        self._update_statusbar()

    def _update_remove_button_state(self):
        """Enable the remove button only while count columns exist."""
        self.remove_button.set_sensitive(len(self.count_cols) > 0)

    def _remove_count_col(self, treeview):
        """Ask which count column to remove, then remove it from data and view."""
        dialog = gtk.Dialog(title='Remove Count Column',
                            buttons=(gtk.STOCK_CANCEL, gtk.ResponseType.CANCEL,
                                     gtk.STOCK_OK, gtk.ResponseType.OK))
        dialog.set_default_response(gtk.ResponseType.OK)

        vbox = dialog.get_content_area()

        list_store = gtk.ListStore(gobject.TYPE_STRING)
        for count_col in self.count_cols:
            list_store.append([count_col[0]])

        combo = gtk.ComboBox(model=list_store)
        renderer = gtk.CellRendererText()
        combo.pack_start(renderer, True, True, 0)
        combo.add_attribute(renderer, 'text', 0)
        combo.set_active(0)

        hbox = gtk.HBox()
        hbox.pack_start(gtk.Label('Select Column:'), True, True, 0)
        hbox.pack_start(combo, True, True, 0)
        vbox.pack_start(hbox, True, True, 0)
        vbox.show_all()

        response = dialog.run()
        col_index = combo.get_active()
        # always destroy the dialog, whatever the response was
        dialog.destroy()

        if response != gtk.ResponseType.OK or col_index < 0:
            return

        self.count_cols = (self.count_cols[:col_index] +
                           self.count_cols[col_index + 1:])

        progress_dialog = ProgressDialog('Removing Column...',
                                         ['Rebuilding UI...'])
        progress_dialog.show()
        tree_model = self._build_list_store(
            link_segs=self.linked_checkbox.get_active(),
            prev_store=treeview.get_model(),
            show_context=self.context_checkbox.get_active())

        # the first 6 view columns are fixed; count columns follow them
        old_col = treeview.get_column(6 + col_index)
        treeview.remove_column(old_col)

        #update the 'text' property of the cell renderers in all columns after the removed column - otherwise cell values get mixed up
        for i in range(6 + col_index, tree_model.get_n_columns()):
            col = treeview.get_column(i)
            renderer = col.get_cell_renderers()[0]
            col.set_attributes(renderer, text=i)

        treeview.set_model(tree_model)

        self._update_remove_button_state()
        self._update_statusbar()
        progress_dialog.ensure_finish()

    def _add_count_col(self, treeview):
        """Ask for a column name and regex, then append a new count column.

        An invalid regex is reported and the dialog is shown again. Any
        response other than OK cancels.
        """
        dialog = gtk.Dialog(title='Add Count Column',
                            buttons=(gtk.STOCK_CANCEL, gtk.ResponseType.CANCEL,
                                     gtk.STOCK_OK, gtk.ResponseType.OK))
        dialog.set_default_response(gtk.ResponseType.OK)

        vbox = dialog.get_content_area()

        grid = gtk.Grid()
        name_label = gtk.Label('Column Name:')
        grid.attach(name_label, 0, 0, 1, 1, 3)
        name_entry = gtk.Entry()
        grid.attach(name_entry, 1, 0, 1, 1, 3)

        regex_label = gtk.Label('Search term:')
        grid.attach(regex_label, 0, 1, 1, 1, 3)
        regex_entry = gtk.Entry()
        grid.attach(regex_entry, 1, 1, 1, 1, 3)

        vbox.pack_start(grid, True, True, 0)
        vbox.show_all()

        while True:
            response = dialog.run()
            if response != gtk.ResponseType.OK:
                # cancel button, window close, etc.
                dialog.destroy()
                return

            name = name_entry.get_text()
            regex = regex_entry.get_text()

            # bound before the try so the error handler can always test it
            progress_dialog = None
            try:
                re.compile(regex)  # raises re.error for an invalid pattern
                dialog.destroy()

                self.count_cols.append([name, regex, 0])  #name, regex, total

                progress_dialog = ProgressDialog('Adding New Column...',
                                                 ['Counting occurrances...'])
                progress_dialog.show()
                list_store = self._build_list_store(
                    link_segs=self.linked_checkbox.get_active(),
                    prev_store=treeview.get_model(),
                    show_context=self.context_checkbox.get_active())
                progress_dialog.ensure_finish()
                treeview.set_model(list_store)

                col = gtk.TreeViewColumn(name,
                                         gtk.CellRendererText(),
                                         text=list_store.get_n_columns() - 1)
                treeview.append_column(col)

                self._update_remove_button_state()
                self._update_statusbar()
                return

            except re.error:
                error_dialog = gtk.MessageDialog(
                    buttons=(gtk.ButtonType.OK),
                    message_format=
                    'The regular expression that has been entered is invalid.'
                )
                error_dialog.run()
                error_dialog.destroy()
                # fall through: show the input dialog again

            except Exception as error:
                error_dialog = gtk.MessageDialog(
                    buttons=(gtk.ButtonType.OK),
                    message_format=
                    'The application has encountered an internal error. Please contact your local programmer to assign blame.'
                )
                error_dialog.run()
                error_dialog.destroy()

                if progress_dialog:
                    progress_dialog.destroy()

                self.logger.error(
                    'Exception in add_column():\n %s\nStacktrace: %s' %
                    (error, traceback.format_exc()))
                return

    def _update_statusbar(self):
        """Show the WHQ total and every count column's total in the statusbar."""
        context_id = self.statusbar.get_context_id('num_whq')
        self.statusbar.pop(context_id)
        totals = 'Totals: WHQ Count: %d' % (self.num_whq)
        for col in self.count_cols:
            totals += ', %s: %d' % (col[0], col[2])

        self.statusbar.push(context_id, totals)

    def _get_link_chain(self, cur_seg):
        """Return the whole chain containing ``cur_seg``, head first.

        The chain is found by following ``prev`` links back to the head and
        ``next`` links forward to the tail; ``cur_seg`` itself is included.
        """
        chain = []

        # cur_seg and everything before it, kept in head-to-tail order
        cur = cur_seg
        while cur is not None:
            chain.insert(0, cur)
            cur = cur.prev

        # everything after cur_seg
        cur = cur_seg.next
        while cur is not None:
            chain.append(cur)
            cur = cur.next

        return chain

    def _build_list_store_row(self, utter_id, start_time, end_time,
                              trans_phrase, speaker_str, target_str, whq_count):
        """Return the six fixed cells of a row (count cells are appended later).

        Times are formatted with two decimals; a ``None`` time becomes ''.
        """
        start_time = ('%0.2f' % (start_time)) if start_time is not None else ''
        end_time = ('%0.2f' % (end_time)) if end_time is not None else ''

        return [
            utter_id,
            '%s - %s' % (start_time, end_time),
            trans_phrase,
            speaker_str,
            target_str,
            whq_count,
        ]

    def _find_utter_index(self, utter):
        """Return the position of ``utter`` in ``utter.seg.utters``, or -1."""
        for i, candidate in enumerate(utter.seg.utters):
            if candidate == utter:
                return i

        return -1

    def _append_context(self, bwd_start_utter, fwd_start_utter, cur_phrase):
        """Wrap ``cur_phrase`` between the nearest preceding and following phrases."""
        bwd_phrase = self._get_adjacent_phrase(bwd_start_utter, -1)
        fwd_phrase = self._get_adjacent_phrase(fwd_start_utter, 1)

        return '(%s)\n%s\n(%s)' % (bwd_phrase, cur_phrase, fwd_phrase)

    def _get_adjacent_phrase(self, start_utter, incr):
        """Return the nearest non-empty phrase before/after ``start_utter``.

        Args:
            start_utter: utterance to start from (it is not considered).
            incr: step applied to both utterance and segment indices;
                negative searches backwards, otherwise forwards.

        Returns:
            The first truthy ``trans_phrase`` found, or '-' when the search
            runs out of segments.
        """
        backwards = incr < 0
        start_seg_num = start_utter.seg.num
        start_utter_index = self._find_utter_index(start_utter) + incr

        phrase = None
        i = start_seg_num
        while not phrase:
            # stop at either end of the segment list
            if backwards:
                if i < 0:
                    break
            elif i >= len(self.filter_manager.get_segs()):
                break

            seg = self.filter_manager.get_seg_by_num(i)

            # in the starting segment continue next to start_utter; in any
            # other segment start at the end nearest to where we came from
            if i == start_seg_num:
                j = start_utter_index
            elif backwards:
                j = len(seg.utters) - 1
            else:
                j = 0

            while not phrase and (j >= 0 if backwards else j < len(seg.utters)):
                phrase = seg.utters[j].trans_phrase
                j += incr

            i += incr

        return phrase or '-'

    def _append_count_cells(self, row, search_text):
        """Append one cell per count column to ``row`` and update the totals."""
        for count_col in self.count_cols:
            count = len(re.findall(count_col[1], search_text))
            count_col[2] += count
            row.append(count)

    def _build_list_store(self,
                          link_segs=False,
                          prev_store=None,
                          show_context=False):
        """Build the list model for the filtered utterances.

        Args:
            link_segs: when true, produce one row per chain of linked
                utterances instead of one row per utterance.
            prev_store: previous model; when given (and truthy) the WHQ
                count of the row at the same position is carried over,
                otherwise every row starts with a WHQ count of 1.
            show_context: when true, the displayed phrase is wrapped in the
                neighbouring phrases (see ``_append_context``).

        Returns:
            A ``gtk.ListStore`` with six fixed columns plus one int column
            per count column. The running totals in ``self.count_cols`` are
            recomputed as a side effect.

        Counts are always taken on the lower-cased phrase without the added
        context, in both per-utterance and linked mode.
        """
        #for now, we always grab segs and convert to chains later if needed
        segments = self.output.get_filtered_items()

        list_store = gtk.ListStore(
            gobject.TYPE_INT,  #utterance id
            gobject.TYPE_STRING,  #time
            gobject.TYPE_STRING,  #phrase
            gobject.TYPE_STRING,  #speakers
            gobject.TYPE_STRING,  #target listeners
            gobject.TYPE_INT,  #whq count
            *([gobject.TYPE_INT] * len(self.count_cols)
              )  #user-defined 'count columns'
        )

        #reset the totals (earlier calls may have left them > 0)
        for count_col in self.count_cols:
            count_col[2] = 0

        row_num = 0
        if link_segs:
            for head in FilterManager.get_chains(segments):
                trans_phrase = head.trans_phrase
                speaker_str = DBConstants.SPEAKER_CODES.get_option(
                    head.speaker.get_codeinfo().get_code(
                    )).desc if head.speaker else '(Unknown)'
                target_str = DBConstants.TRANS_CODES[1].get_option(
                    head.trans_codes[1]
                ).desc if head.trans_codes else '(Unknown)'

                # fold the rest of the chain into the head's row
                cur = head.next
                while cur:
                    trans_phrase += '\n->%s' % (cur.trans_phrase)

                    if cur.speaker:
                        speaker_str += ', %s' % (
                            DBConstants.SPEAKER_CODES.get_option(
                                cur.speaker.get_codeinfo().get_code()).desc)

                    if cur.trans_codes:
                        target_str += ', %s' % (
                            DBConstants.TRANS_CODES[1].get_option(
                                cur.trans_codes[1]).desc)

                    cur = cur.next

                # count on the chain's own text, before context is added
                search_text = trans_phrase.lower()

                tail = FilterManager.get_endpoint(
                    FilterManager.ENDPOINT_TYPES.TAIL, head)
                if show_context:
                    trans_phrase = self._append_context(
                        head, tail, trans_phrase)

                whq_count = prev_store[row_num][5] if prev_store else 1
                row = self._build_list_store_row(head.id, head.start, tail.end,
                                                 trans_phrase, speaker_str,
                                                 target_str, whq_count)
                self._append_count_cells(row, search_text)

                list_store.append(row)
                row_num += 1

        else:
            for seg in segments:
                for utter in seg.utters:
                    trans_phrase = utter.trans_phrase
                    if show_context:
                        trans_phrase = self._append_context(
                            utter, utter, trans_phrase)

                    whq_count = prev_store[row_num][5] if prev_store else 1
                    speaker_str = DBConstants.SPEAKER_CODES.get_option(
                        utter.speaker.speaker_codeinfo.get_code(
                        )).desc if utter.speaker else '(Unknown)'
                    target_str = DBConstants.TRANS_CODES[1].get_option(
                        utter.trans_codes[1]
                    ).desc if utter.trans_codes else '(Unknown)'

                    row = self._build_list_store_row(utter.id, utter.start,
                                                     utter.end, trans_phrase,
                                                     speaker_str, target_str,
                                                     whq_count)
                    self._append_count_cells(row, utter.trans_phrase.lower())

                    list_store.append(row)
                    row_num += 1

        return list_store

    def _build_treeview(self):
        """Create the TreeView with the fixed columns and the count columns."""
        list_store = self._build_list_store()
        treeview = gtk.TreeView(list_store)

        #create hidden id column
        col = gtk.TreeViewColumn('ID', gtk.CellRendererText(), text=0)
        col.set_visible(False)
        col.set_resizable(True)
        treeview.append_column(col)

        col_names = ['Time', 'Phrase', 'Speakers', 'Target Listeners']
        for i, col_name in enumerate(col_names):
            col = gtk.TreeViewColumn(col_name,
                                     gtk.CellRendererText(),
                                     text=(i + 1))
            col.set_resizable(True)
            treeview.append_column(col)

        # editable spin cell for the WHQ count
        spin_renderer = gtk.CellRendererSpin()
        adj = gtk.Adjustment(value=1,
                             lower=0,
                             upper=100,
                             page_incr=5,
                             step_incr=1,
                             page_size=0)
        spin_renderer.set_property('adjustment', adj)
        spin_renderer.set_property('editable', True)
        spin_renderer.connect('edited', self._update_row, treeview)
        col = gtk.TreeViewColumn('WHQ Count',
                                 spin_renderer,
                                 text=(len(col_names) + 1))
        col.set_resizable(True)
        treeview.append_column(col)

        for i, count_col in enumerate(self.count_cols):
            col = gtk.TreeViewColumn(count_col[0],
                                     gtk.CellRendererText(),
                                     text=(len(col_names) + 2 + i))
            col.set_resizable(True)
            treeview.append_column(col)

        treeview.connect('key-press-event', self._keypress_callback, treeview)

        return treeview

    def _keypress_callback(self, widget, event, treeview):
        """On Tab, move the cursor to the next row and start editing there."""
        key_name = gdk.keyval_name(event.keyval)
        # key_name may be None for a key value without a name
        if key_name and key_name.lower() == 'tab':
            (model, paths) = treeview.get_selection().get_selected_rows()
            total_rows = model.iter_n_children(None)
            if paths and paths[0][0] + 1 < total_rows:
                treeview.set_cursor(paths[0][0] + 1,
                                    focus_column=treeview.get_column(3),
                                    start_editing=True)

    def _update_row(self, widget, path, value, treeview):
        """Store an edited WHQ count and adjust the WHQ total accordingly."""
        #we must retrieve the model each time this method is called (rather than just passing in a reference to it), since the model is re-defined every time a count column is added or removed
        model = treeview.get_model()
        old_val = int(model[path][5])
        new_val = int(value)

        self.num_whq += (new_val - old_val)

        model[path][5] = new_val
        self._update_statusbar()

    def _export_results(self, treeview):
        """Ask for a file name and write the current rows through ``FreqExporter``.

        Afterwards the file is either opened with the configured spreadsheet
        program or a confirmation dialog is shown.
        """
        dialog = gtk.FileChooserDialog(
            title='Save',
            action=gtk.FileChooserAction.SAVE,
            buttons=(gtk.STOCK_CANCEL, gtk.ResponseType.CANCEL, gtk.STOCK_SAVE,
                     gtk.ResponseType.OK))
        dialog.set_default_response(gtk.ResponseType.OK)
        dialog.add_filter(UIUtils.CSV_FILE_FILTER)
        dialog.add_filter(UIUtils.ALL_FILE_FILTER)

        #splice in the 'open immediately' checkbox
        content_area = dialog.get_content_area()
        open_now_checkbox = gtk.CheckButton('Open Immediately')
        open_now_checkbox.set_active(True)
        align = gtk.Alignment(xalign=1.0, yalign=1.0)
        align.add(open_now_checkbox)
        content_area.pack_end(align, False, False, 0)
        open_now_checkbox.show()
        align.show()

        response = dialog.run()
        filename = dialog.get_filename()
        open_now = open_now_checkbox.get_active()
        # always destroy the dialog, whatever the response was
        dialog.destroy()

        if response != gtk.ResponseType.OK:
            return

        # transpose [name, regex, total] rows into per-field sequences
        count_col_headers, _, count_col_totals = zip(
            *self.count_cols) if self.count_cols else [[]] * 3

        exporter = FreqExporter(filename, self.trs_parser.filename)
        exporter.write_header_row(count_col_headers)

        list_store = treeview.get_model()
        tree_it = list_store.get_iter_first()
        while tree_it:
            #we must remove newline chars, otherwise Excel thinks it's the end of a row (even when it's quoted...)
            phrase = list_store.get_value(tree_it, 2).replace('\n',
                                                              ' ').replace(
                                                                  '\r', '')
            time_str = list_store.get_value(tree_it, 1)
            speakers_str = list_store.get_value(tree_it, 3) or '(Unknown)'
            targets_str = list_store.get_value(tree_it, 4) or '(Unknown)'
            num_utters = int(list_store.get_value(tree_it, 5))

            # columns 6.. hold the per-row values of the count columns
            count_col_vals = [
                int(list_store.get_value(tree_it, i))
                for i in range(6, list_store.get_n_columns())
            ]

            exporter.write_count_row(time_str, phrase, speakers_str,
                                     targets_str, num_utters, count_col_vals)

            tree_it = list_store.iter_next(tree_it)

        exporter.finish(self.num_whq, count_col_totals)

        if open_now:
            subprocess.Popen(
                ['%s' % DBConstants.SETTINGS.SPREADSHEET_PATH, filename])
        else:
            result_dialog = gtk.MessageDialog(
                buttons=gtk.ButtonType.OK,
                message_format='Results exported successfully.')
            result_dialog.run()
            result_dialog.destroy()
def export(self, progress_update_fcn=None, progress_next_phase_fcn=None):
    """Write every output of this configuration for the TRS file to a CSV file.

    A block of header rows describing the export is written first. The TRS
    file is then parsed, and each output in ``self.config.outputs`` is
    reset, fed the parsed segments (or chains, when the output is chained)
    and asked to write its rows. When ``self.summary_filename`` is
    non-empty, one summary row for this TRS file is appended to that file;
    a header row is written first if the summary file does not exist yet.

    Args:
        progress_update_fcn: optional callable. It is passed to the TRS
            parser and is also called with ``1`` after each output has
            received all of its items.
        progress_next_phase_fcn: optional callable taking no arguments.
            It is passed to the TRS parser and is also called once before
            each output is processed.
    """
    # 'with' guarantees the export file is closed even when parsing or one
    # of the outputs raises part-way through.
    # NOTE(review): binary mode is what the Python 2 csv module expects.
    with open(self.export_filename, 'wb') as export_file:
        csv_writer = csv.writer(export_file, quoting=csv.QUOTE_ALL)

        # header info, one single-cell row per line
        header_lines = [
            'Export Date: %s' % (UIUtils.get_cur_timestamp_str()),
            'Configuration Creation Date: %s' % (self.config.created),
            'TRS Filename: %s' % (self.trs_filename),
            'Output Configuration:',
            'Name: %s' % (self.config.name),
            'Description: %s' % (self.config.desc),
            '',
            'Outputs:',
            '',
        ]
        for header_line in header_lines:
            csv_writer.writerow([header_line])

        # parse the trs file
        trs_parser = TRSParser(self.trs_filename)
        segs = trs_parser.parse(progress_update_fcn,
                                progress_next_phase_fcn,
                                validate=False)

        # chains are built on demand the first time a chained output needs
        # them, then reused. Testing against None (rather than truthiness)
        # keeps an empty chain list cached as well.
        chains = None

        summary_row = [os.path.basename(self.trs_filename)[:-4]]
        summary_head = ["TRS file"]

        # add segments/chains to each configured output, then write that
        # output to the spreadsheet file
        for cur_output in self.config.outputs:
            # update progress bar text
            if progress_next_phase_fcn:
                progress_next_phase_fcn()

            cur_output.reset()  # clear any cached utterances from previous runs

            if cur_output.chained and chains is None:
                chains = FilterManager.get_chains(segs)

            items = chains if cur_output.chained else segs
            for item in items:
                # note: filter_utters only affects segs (not chains)
                cur_output.add_item(item, filter_utters=True)

            # note: updating progress for every item in the loop above slows
            # processing considerably (by a factor of ~4) - a compromise is
            # to just set each phase to 100% after it completes.
            if progress_update_fcn:
                progress_update_fcn(1)

            # grab the output's results and write them to the file
            cur_output.write_csv_rows(csv_writer)

            # collect this output's column for the summary row
            summary_head.append(cur_output.name)
            summary_row.append(cur_output.get_summary())

            csv_writer.writerow([''])

    # truthiness test also copes with summary_filename being None
    if self.summary_filename:
        # only a summary file that does not exist yet needs a header row
        need_head = not os.path.isfile(self.summary_filename)
        with open(self.summary_filename, 'at') as fp:
            summary_writer = csv.writer(fp, quoting=csv.QUOTE_ALL)
            if need_head:
                summary_writer.writerow(summary_head)
            summary_writer.writerow(summary_row)
def process_dir(path, env, par_code, writer):
    """Accumulate word and utterance counts for all TRS files under a path.

    Every ``*.trs`` file matching ``path`` is parsed and its segments are
    divided into single, numbered-multi and unnumbered-multi groups. Two
    sets of totals are accumulated through ``get_trans_awc``:

    * "no overlap": single and unnumbered-multi groups, with
      ``exclude_angle=True``;
    * "all speech": all three groups, with ``exclude_angle=False``.

    One row holding the totals and the per-utterance averages is written
    to ``writer``. Files for which ``get_trans_segs`` returns nothing are
    skipped.

    Args:
        path: prefix that ``'*.trs'`` is appended to directly, so a
            directory must include its trailing separator.
        env: inserted into the database filename
            (``lena_db_path + env + '.db'``).
        par_code: value written as the first column of the output row.
        writer: object with a ``writerow`` method (presumably a csv
            writer - confirm against the caller).
    """
    trs_filenames = glob.glob('%s*.trs' % (path))

    utter_counts = [0] * len(container_types)
    word_counts = [0] * len(container_types)

    lena_db = Database('%s%s.db' % (lena_db_path, env))
    # try/finally so the database handle is released even if a file fails
    # to parse or a count raises.
    try:
        for filename in trs_filenames:
            # a single parenthesised argument prints identically whether
            # print is a statement or a function
            print('\n\tProcessing file %s' % (os.path.basename(filename)))

            parser = TRSParser(filename)
            segs = get_trans_segs(parser.parse())
            if not segs:
                continue

            print('\tExamining range: %s (%0.2f) - %s (%0.2f)' % (
                get_time_str(segs[0].start), segs[0].start,
                get_time_str(segs[-1].end), segs[-1].end))

            sm = StateMachine()
            single, numbered_multi, unnumbered_multi = sm.divide_segs(segs)

            # for non-overlapping speech (no numbered_multi)
            for seg_group in (single, unnumbered_multi):
                trans_awc, lena_awc, utter_count = get_trans_awc(
                    seg_group, lena_db, filename, exclude_angle=True)
                word_counts[container_types.TRANS_NO_OVERLAP] += trans_awc
                word_counts[container_types.LENA_NO_OVERLAP] += lena_awc
                utter_counts[container_types.TRANS_NO_OVERLAP] += utter_count

            # for all speech
            for seg_group in (single, numbered_multi, unnumbered_multi):
                trans_awc, lena_awc, utter_count = get_trans_awc(
                    seg_group, lena_db, filename, exclude_angle=False)
                word_counts[container_types.TRANS_ALL_SPEECH] += trans_awc
                word_counts[container_types.LENA_ALL_SPEECH] += lena_awc
                utter_counts[container_types.TRANS_ALL_SPEECH] += utter_count
    finally:
        lena_db.close()

    # note: lena and transcriber measures have matching segments, so the
    # transcriber utterance counts are used as the divisor for both
    no_overlap_utters = utter_counts[container_types.TRANS_NO_OVERLAP]
    all_speech_utters = utter_counts[container_types.TRANS_ALL_SPEECH]

    # averages stay at 0 when there were no utterances (avoids dividing by 0)
    trans_avg_no_overlap = 0
    lena_avg_no_overlap = 0
    if no_overlap_utters > 0:
        trans_avg_no_overlap = (
            word_counts[container_types.TRANS_NO_OVERLAP] /
            float(no_overlap_utters))
        lena_avg_no_overlap = (
            word_counts[container_types.LENA_NO_OVERLAP] /
            float(no_overlap_utters))

    trans_avg_all_speech = 0
    lena_avg_all_speech = 0
    if all_speech_utters > 0:
        trans_avg_all_speech = (
            word_counts[container_types.TRANS_ALL_SPEECH] /
            float(all_speech_utters))
        lena_avg_all_speech = (
            word_counts[container_types.LENA_ALL_SPEECH] /
            float(all_speech_utters))

    writer.writerow([
        par_code,

        word_counts[container_types.TRANS_NO_OVERLAP],
        no_overlap_utters,
        '%0.3f' % (trans_avg_no_overlap),

        word_counts[container_types.TRANS_ALL_SPEECH],
        all_speech_utters,
        '%0.3f' % (trans_avg_all_speech),

        word_counts[container_types.LENA_NO_OVERLAP],
        no_overlap_utters,
        '%0.3f' % (lena_avg_no_overlap),

        word_counts[container_types.LENA_ALL_SPEECH],
        all_speech_utters,
        '%0.3f' % (lena_avg_all_speech),
    ])
def create_check(self):
    """Create a Check from the form, store its tests and open the test window.

    The form is validated first; a validation message is shown in a dialog
    and nothing else happens when validation fails. Otherwise a ``Check``
    is built from the form fields, the input file is parsed (``TRSParser``
    for names ending in ``.trs``, ``CSVParser`` for anything else) and
    ``check.num_segs`` segments are picked, either randomly or
    contiguously. If the file yields too few segments an error dialog is
    shown. Otherwise the check, any segments without a ``db_id`` and one
    ``Test`` per segment are inserted into the database, this window is
    destroyed and a ``TestWindow`` is opened for the check.
    """
    error_msg = self.validate_form()
    if error_msg:
        UIUtils.show_message_dialog(error_msg)
        return

    filters = self.filters_frame.get_filters()

    check = Check(
        self.form.name_entry.get_text(),
        self.form.input_file_entry.get_text(),
        self.form.wav_file_entry.get_text(),
        self.form.num_segs_spinner.get_value_as_int(),
        self.form.context_pad_spinner.get_value_as_int(),
        [],
        0,
        filters=filters,
        pick_randomly=self.form.rand_checkbox.get_active(),
    )

    progress_dialog = ProgressDialog(
        title='Loading File',
        phases=['Parsing file...', 'Setting up...'])

    if check.input_filename.lower().endswith('.trs'):
        # TRS files
        parser = TRSParser(check.input_filename)
        progress_dialog.show()
        segs = parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            progress_next_phase_fcn=progress_dialog.next_phase,
            validate=False,
            seg_filters=check.filters)
    else:
        # CSV files
        parser = CSVParser(check.input_filename)
        progress_dialog.show()
        segs = parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            seg_filters=check.filters)

    # NOTE(review): treated as applying to both input types - confirm that
    # TRSParser.parse does not already advance the phase when validate=False
    progress_dialog.next_phase()

    if check.pick_randomly:
        # hacked_pick_rand_segs is used in place of
        # ParserTools.pick_rand_segs(check.num_segs, segs)
        segs = ParserTools.hacked_pick_rand_segs(
            check.num_segs, segs, os.path.basename(check.input_filename))
    else:
        segs = ParserTools.pick_contiguous_segs(check.num_segs, segs)
    progress_dialog.set_fraction(1.0)

    if len(segs) < check.num_segs:
        # close the progress bar (even though there's still one phase left)
        progress_dialog.ensure_finish()
        UIUtils.show_message_dialog(
            'The input file does not contain enough segments of the specified types.',
            dialog_type=gtk.MessageType.ERROR)
        return

    db = BLLDatabase()
    # try/finally so the database handle is closed even if an insert fails
    try:
        check.db_insert(db)

        for i, seg in enumerate(segs):
            # segments without a db_id have not been stored yet
            if seg.db_id is None:
                seg.db_insert(db)

            test = Test(
                check.db_id,
                None,
                None,
                None,
                seg,
                None,
                check.default_context_padding,
            )
            test.db_insert(db)
            check.tests.append(test)

            progress_dialog.set_fraction(
                float(i + 1) / float(check.num_segs))
    finally:
        db.close()

    progress_dialog.ensure_finish()

    self.window.destroy()
    TestWindow(check)