Ejemplo n.º 1
0
def process_dir(path, env, par_code, writer):
    """Total the child-vocalization counts for the .trs files under ``path``.

    Every file matching ``path + '*.trs'`` is parsed and reduced with
    ``get_trans_segs``.  For files that yield segments, the range from the
    first segment's start to the last segment's end is examined and the
    results of ``get_trans_child_vocs`` and ``get_lena_child_vocs`` are added
    to running totals.  A single row ``[par_code, trans_count, lena_count]``
    is written once all files have been processed.

    Args:
        path: Prefix that is glob-expanded with ``*.trs`` appended.
        env: Name used to select the database file
            ``lena_db_path + env + '.db'``.
        par_code: Value written as the first column of the output row.
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
    """
    trs_filenames = glob.glob('%s*.trs' % (path))

    trans_count = 0
    lena_count = 0

    lena_db = Database('%s%s.db' % (lena_db_path, env))
    try:
        for filename in trs_filenames:
            print('\n\tProcessing file %s' % (os.path.basename(filename)))

            segs = get_trans_segs(TRSParser(filename).parse())
            if not segs:
                # Nothing usable in this file; move on to the next one.
                continue

            zone_start = segs[0].start
            zone_end = segs[-1].end
            print('\tExamining range: %s (%0.2f) - %s (%0.2f)' % (
                get_time_str(zone_start), zone_start,
                get_time_str(zone_end), zone_end))

            trans_count += get_trans_child_vocs(segs)
            lena_count += get_lena_child_vocs(lena_db, filename, zone_start,
                                              zone_end)
    finally:
        # Release the database even if parsing or counting raised.
        lena_db.close()

    writer.writerow([par_code, trans_count, lena_count])
Ejemplo n.º 2
0
    def __init__(self, filename, progress_dialog):
        """Build and show the 'Transcription Verifier' window for a trs file.

        Args:
            filename: Path handed to ``TRSParser``.
            progress_dialog: Object providing ``set_fraction`` and
                ``next_phase``; it receives progress callbacks while the
                file is parsed and while the treeview is built.
        """
        self.logger = logging.getLogger(__name__)
        self.window = gtk.Window(gtk.WindowType.TOPLEVEL)
        self.window.set_title('Transcription Verifier')
        self.window.connect('destroy', lambda x: self.window.destroy())
        self.window.set_border_width(10)
        self.window.set_default_size(580, 500)

        # Parse up front; the return value is discarded here, so the parsed
        # data is presumably read back from self.trs_parser later - TODO confirm.
        self.trs_parser = TRSParser(filename)
        self.trs_parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            progress_next_phase_fcn=progress_dialog.next_phase,
            remove_bad_trans_codes=False)
        self.wav_parser = None

        # Advance the progress dialog to the phase covering the UI build.
        progress_dialog.next_phase()
        # NOTE(review): filter_errors is set before the build_* helpers run;
        # they may depend on it - confirm before reordering.
        self.filter_errors = True
        self.toolbar = self.build_toolbar()
        self.treeview = self.build_treeview(progress_dialog.set_fraction)
        self.treeview.expand_all()

        # Wrap the treeview so scrollbars appear only when needed.
        scrolled_win = gtk.ScrolledWindow()
        scrolled_win.set_policy(gtk.PolicyType.AUTOMATIC,
                                gtk.PolicyType.AUTOMATIC)
        scrolled_win.add(self.treeview)

        # Toolbar keeps its natural height; the scrolled treeview expands.
        vbox = gtk.VBox(False, 2)
        vbox.pack_start(self.toolbar, False, False, 0)
        vbox.pack_start(scrolled_win, True, True, 0)

        self.window.add(vbox)

        self.window.show_all()
Ejemplo n.º 3
0
    def _compare(self, file1_path, file2_path, file1_name, file2_name):
        """Parse two trs files and open a window showing their HTML diff.

        The main window is made insensitive while a progress dialog tracks
        parsing, comparison and output generation.  When the diff window has
        been created the progress dialog is finished and the main window is
        destroyed.

        Args:
            file1_path: Path of the first trs file.
            file2_path: Path of the second trs file.
            file1_name: Heading used for the first file in the diff.
            file2_name: Heading used for the second file in the diff.
        """
        self.window.set_sensitive(False)

        trs_paths = (file1_path, file2_path)

        # One parsing phase per input file, followed by the two fixed phases.
        phase_names = [
            'Parsing trs file %d...' % (file_num)
            for file_num in range(1, len(trs_paths) + 1)
        ]
        phase_names += ['Comparing files...', 'Generating output...']

        dialog = ProgressDialog('Processing Files...', phase_names)
        dialog.show()

        segs = []
        for trs_path in trs_paths:
            parsed = TRSParser(trs_path).parse(
                progress_update_fcn=dialog.set_fraction,
                validate=False,
                remove_bad_trans_codes=False)
            segs.append(parsed)
            dialog.next_phase()

        desc_strs = self._build_desc_strs(segs, dialog)
        dialog.next_phase()

        differ = difflib.HtmlDiff()
        html = differ.make_file(*desc_strs,
                                fromdesc=file1_name,
                                todesc=file2_name,
                                context=True,
                                numlines=0)

        # Prevent font selection from killing webkit on Windows systems.
        html = html.replace('font-family:Courier;', '')
        DiffWin(html)

        dialog.ensure_finish()

        self.window.destroy()
Ejemplo n.º 4
0
def process_dir(full_item_path, output_dir):
    """Build one matrix from a directory's .trs files and write it out.

    Every file matching ``full_item_path + '*.trs'`` is parsed; its segments
    are divided into single, numbered-multi and unnumbered-multi groups and
    each group is folded into a shared matrix.  The matrix and the segment
    counts are then passed to ``output_matrix``.

    The counts start at zero and are summed over all files, like the matrix
    itself.  Previously they held only the last file's values, and were
    undefined (NameError) when no file produced any segments.

    Args:
        full_item_path: Directory prefix ending in ``/``; its last path
            component names the output file.
        output_dir: Prefix under which ``<name>-matrix.csv`` is written.
    """
    trs_filenames = glob.glob('%s*.trs' % (full_item_path))
    matrix = build_matrix(TRANS_CODES, LENA_CODES)

    count_single = 0
    count_numbered_multi = 0
    count_unnumbered_multi = 0
    count_angle_brackets = 0

    for filename in trs_filenames:
        print('\n\tProcessing file %s' % (os.path.basename(filename)))

        segs = get_trans_segs(TRSParser(filename).parse())
        if not segs:
            continue

        print('\tExamining range: %s - %s' % (
            get_time_str(segs[0].start), get_time_str(segs[-1].end)))

        sm = StateMachine()
        single, numbered_multi, unnumbered_multi = sm.divide_segs(
            segs, use_lena_segmentation=False)

        count_single += len(single)
        count_numbered_multi += len(numbered_multi)
        count_unnumbered_multi += len(unnumbered_multi)
        count_angle_brackets += sum(
            count_angle_bracket_segs(group)
            for group in (single, numbered_multi, unnumbered_multi))

        process_single(single, matrix)
        process_numbered_multi(numbered_multi, matrix)
        process_unnumbered_multi(unnumbered_multi, matrix)

    # full_item_path ends with '/', so [-2] is the directory's own name.
    output_name = '%s%s-matrix.csv' % (output_dir,
                                       full_item_path.split('/')[-2])
    output_matrix(matrix, output_name, count_single, count_numbered_multi,
                  count_unnumbered_multi, count_angle_brackets)
Ejemplo n.º 5
0
def populate_segs(db, trs_folder):
    """Insert a ``trs_segs`` row for every segment found in ``trs_folder``.

    Files matching ``*.trs`` and ``*.its`` under the folder prefix are
    parsed.  For .its files the parsed segments are flattened into their
    utterances, and each utterance is stored as a row.  Each row holds the
    upper-cased file name without extension, the start, the end and a
    speaker code.

    Args:
        db: Object exposing ``insert(table, columns, rows)``.
        trs_folder: Prefix that is glob-expanded with ``*.trs`` / ``*.its``.
    """
    print('Populating db using trs files...')

    filenames = glob.glob('%s*.trs' % (trs_folder))
    filenames += glob.glob('%s*.its' % (trs_folder))
    total_files = len(filenames)

    for file_num, filename in enumerate(filenames, 1):
        print('File %d of %d' % (file_num, total_files))

        is_trs = filename.endswith('.trs')

        if is_trs:
            segs = TRSParser(filename).parse(validate=False)
        else:
            # Hack for its files: store the utterances rather than the
            # segments that contain them.
            segs = []
            for its_seg in ITSParser(filename).parse():
                segs.extend(its_seg.utters)

        # Both extensions are four characters long, so [:-4] strips either.
        file_cd = os.path.basename(filename[:-4]).upper()

        for cur_seg in segs:
            if is_trs:
                has_fuz = any(
                    speaker.speaker_codeinfo.code == 'FUZ'
                    for speaker in cur_seg.speakers)

                if has_fuz and len(cur_seg.speakers) > 1:
                    print('Warning: Found multi-speaker FUZ seg in file: "%s"' % (
                        os.path.basename(filename)))

            # trs segments carry a speaker list (the first one is stored);
            # its utterances carry a single speaker.
            row = (
                file_cd,
                cur_seg.start,
                cur_seg.end,
                cur_seg.speakers[0].speaker_codeinfo.code if is_trs
                else cur_seg.speaker.speaker_codeinfo.code,
            )
            db.insert('trs_segs', 'file_cd start end speaker'.split(), (row, ))

    print('done.\n')
Ejemplo n.º 6
0
def run():
    """Write one matrix csv per sub-directory of every environment.

    Logging is configured to write ERROR-level messages to
    ``logs/confusion.log``.  Then, for each name in ``envs``, every
    sub-directory of ``input_path + env + '/'`` has its ``*.trs`` files
    parsed and folded into a fresh matrix, which is written to
    ``output_path + env + '/<subdir>-matrix.csv'``.

    The segment counts passed to ``output_matrix`` start at zero for each
    sub-directory and are summed over its files, like the matrix itself.
    Previously they held only the last file's values, and were undefined
    (NameError) when a sub-directory produced no segments.
    """
    LOGFILE = 'logs/confusion.log'

    # Create the log file if it doesn't exist.
    check_log_file(LOGFILE)
    # Prefix each logged message with a timestamp.
    logging.basicConfig(level=logging.ERROR,
                        filename=LOGFILE,
                        format='%(asctime)s %(message)s')

    for cur_env in envs:
        print('Processing environment: %s' % (cur_env))
        output_dir = '%s%s/' % (output_path, cur_env)
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)

        input_dir = '%s%s/' % (input_path, cur_env)

        for item in os.listdir(input_dir):
            full_item_path = '%s%s/' % (input_dir, item)
            if not os.path.isdir(full_item_path):
                continue

            trs_filenames = glob.glob('%s*.trs' % (full_item_path))
            matrix = build_matrix(TRANS_CODES, LENA_CODES)

            count_single = 0
            count_numbered_multi = 0
            count_unnumbered_multi = 0
            count_angle_brackets = 0

            for filename in trs_filenames:
                print('\n\tProcessing file %s' % (os.path.basename(filename)))

                segs = get_trans_segs(TRSParser(filename).parse())
                if not segs:
                    continue

                print('\tExamining range: %s - %s' % (
                    get_time_str(segs[0].start), get_time_str(segs[-1].end)))

                single, numbered_multi, unnumbered_multi = divide_segs(segs)

                count_single += len(single)
                count_numbered_multi += len(numbered_multi)
                count_unnumbered_multi += len(unnumbered_multi)
                count_angle_brackets += sum(
                    count_angle_bracket_segs(group)
                    for group in (single, numbered_multi, unnumbered_multi))

                process_single(single, matrix)
                process_numbered_multi(numbered_multi, matrix)
                process_unnumbered_multi(unnumbered_multi, matrix)

            # full_item_path ends with '/', so [-2] is the directory's name.
            output_name = '%s%s-matrix.csv' % (output_dir,
                                               full_item_path.split('/')[-2])
            output_matrix(matrix, output_name, count_single,
                          count_numbered_multi, count_unnumbered_multi,
                          count_angle_brackets)
Ejemplo n.º 7
0
def check_pair(csv_filename, trs_filename, speaker_cds):
    """Check that a csv file's rows line up with a trs file's segments.

    Row end times are built by adding ``get_row_dur(row, speaker_cds)`` to a
    running start time, beginning at the first row's ``Elapsed_Time``.  All
    comparisons are made after rounding to two decimal places.  Several csv
    rows may fall inside one trs segment, but a csv row that ends after the
    current trs segment ends is reported as a misalignment.

    Args:
        csv_filename: Path of the csv file; must have an ``Elapsed_Time``
            column.
        trs_filename: Path of the trs file, parsed with ``validate=False``.
        speaker_cds: Passed through unchanged to ``get_row_dur``.

    Returns:
        A ``(misaligned, error_msg)`` tuple.  ``error_msg`` is ``''`` when
        ``misaligned`` is False.  If exactly one of the two files has no
        rows/segments the pair is reported as misaligned; if both are empty
        there is nothing to compare and the pair is reported as aligned.
    """
    # The csv is opened first (as before) so a missing csv still fails before
    # the trs file is parsed; the with-block closes it even if parsing raises.
    with open(csv_filename, 'rb') as csv_file:
        trs_segs = TRSParser(trs_filename).parse(validate=False)
        csv_segs = list(csv.DictReader(csv_file))

    # Guard against empty input, which used to raise IndexError below.
    if not csv_segs or not trs_segs:
        if not csv_segs and not trs_segs:
            return False, ''
        return True, 'One of the files contains no segments.'

    csv_start = float(csv_segs[0]['Elapsed_Time'])
    if round(csv_start, 2) != round(trs_segs[0].start, 2):
        return True, 'File start times are different.'

    i = 0
    j = 0
    while i < len(csv_segs) and j < len(trs_segs):
        csv_end = csv_start + get_row_dur(csv_segs[i], speaker_cds)
        trs_seg = trs_segs[j]

        r_csv_end = round(csv_end, 2)
        r_trs_end = round(trs_seg.end, 2)

        if r_csv_end > r_trs_end:
            # The csv row runs past the end of the current trs segment.
            error_msg = 'csv: %f - %f\ntrs: %f - %f' % (
                round(csv_start, 2), r_csv_end,
                round(trs_seg.start, 2), r_trs_end)
            return True, error_msg

        if r_csv_end == r_trs_end:
            # Row and segment end together: advance to the next segment too.
            j += 1

        # Either way this row is consumed; the next one starts where it ended.
        i += 1
        csv_start = csv_end

    return False, ''
Ejemplo n.º 8
0
    def __init__(self, filename, progress_dialog):
        """Build and show the 'WH-Frequency Counter' window for a trs file.

        The file is parsed with ``TRSParser`` and every parsed segment is
        added to an ``Output`` configured with a ``WHQFilter``.  The window
        is then assembled from a scrolled treeview, a status bar, an options
        frame and a button row.

        Args:
            filename: Path handed to ``TRSParser``.
            progress_dialog: Object providing ``set_fraction``,
                ``next_phase`` and ``ensure_finish``; it receives parse
                progress and is finished once the treeview has been built.
        """
        self.window = gtk.Window(gtk.WindowType.TOPLEVEL)
        self.window.set_title('WH-Frequency Counter')
        self.window.set_border_width(10)
        self.window.set_default_size(730, 400)

        self.logger = logging.getLogger(__name__)

        self.count_cols = self._get_initial_count_cols()
        self.trs_parser = TRSParser(filename)
        segments = self.trs_parser.parse(
            progress_update_fcn=progress_dialog.set_fraction,
            progress_next_phase_fcn=progress_dialog.next_phase,
            validate=False,
            seg_filters=[])

        # This object caches original segs and helps with lookup by segment
        # number.
        self.filter_manager = FilterManager(segments)
        calc = CountOutputCalc('', CountOutputCalc.COUNT_TYPES.PER_SEG, 1)
        # This object filters and allows us to retrieve the filtered segs.
        self.output = Output('', '', [WHQFilter()], calc, False)
        # Explicit loop instead of map(): map() is only eager on Python 2, so
        # using it for side effects silently does nothing on Python 3.
        for seg in segments:
            self.output.add_item(seg)

        treeview = self._build_treeview()
        # Ensure the progress dialog self-destructs even if no utterances are
        # found (in that case the above call never invokes
        # progress_dialog.set_fraction).
        progress_dialog.ensure_finish()

        scrolled_win = gtk.ScrolledWindow()
        # Same enum spelling as gtk.WindowType.TOPLEVEL above.
        scrolled_win.set_policy(gtk.PolicyType.AUTOMATIC,
                                gtk.PolicyType.AUTOMATIC)
        scrolled_win.add(treeview)

        export_button = UIUtils.create_button('Export Results',
                                              UIUtils.BUTTON_ICONS.EXPORT)
        export_button.connect('clicked',
                              lambda widget: self._export_results(treeview))

        close_button = UIUtils.create_button('Close',
                                             UIUtils.BUTTON_ICONS.CLOSE)
        close_button.connect('clicked', lambda w: self.window.destroy())

        add_button = UIUtils.create_button('Add Count Column',
                                           UIUtils.BUTTON_ICONS.ADD)
        add_button.connect('clicked', lambda w: self._add_count_col(treeview))

        self.remove_button = UIUtils.create_button('Remove Count Column',
                                                   UIUtils.BUTTON_ICONS.REMOVE)
        self.remove_button.connect('clicked',
                                   lambda w: self._remove_count_col(treeview))
        self._update_remove_button_state()

        options_frame = gtk.Frame(label='Options')
        options_vbox = gtk.VBox()
        self.linked_checkbox = gtk.CheckButton('Group Linked Segments')
        self.linked_checkbox.connect('toggled', self._toggle_seg_grouping,
                                     treeview)
        options_vbox.pack_start(self.linked_checkbox, False, False, 0)

        self.context_checkbox = gtk.CheckButton('Show Context')
        self.context_checkbox.connect('toggled', self._toggle_show_context,
                                      treeview)
        options_vbox.pack_start(self.context_checkbox, False, False, 0)

        options_frame.add(options_vbox)

        self.statusbar = gtk.Statusbar()
        self.statusbar.set_has_resize_grip(False)
        # Number of top-level rows in the treeview's model.
        self.num_whq = treeview.get_model().iter_n_children(None)
        self._update_statusbar()

        vbox = gtk.VBox()

        bbox = gtk.HButtonBox()
        bbox.pack_start(export_button, True, False, 0)
        bbox.pack_start(add_button, True, False, 0)
        bbox.pack_start(self.remove_button, True, False, 0)
        bbox.pack_start(close_button, True, False, 0)

        vbox.pack_start(scrolled_win, True, True, 0)
        vbox.pack_start(self.statusbar, False, False, 0)
        vbox.pack_end(bbox, False, False, 0)
        vbox.pack_end(options_frame, False, False, 0)
        self.window.add(vbox)

        self.window.show_all()
Ejemplo n.º 9
0
    def export(self, progress_update_fcn=None, progress_next_phase_fcn=None):
        """Write every configured output for the trs file to a csv file.

        A header block is written to ``self.export_filename``, the trs file
        is parsed, and each output in ``self.config.outputs`` is reset, fed
        the segments (or chains, for chained outputs) and asked to write its
        rows.  If ``self.summary_filename`` is non-empty, one summary row is
        appended to that file, preceded by a heading row when the file does
        not exist yet.

        Args:
            progress_update_fcn: Optional callable; passed to the parser and
                called with ``1`` after each output has been filled.
            progress_next_phase_fcn: Optional callable; passed to the parser
                and called with no arguments before each output is processed.
        """
        summary_head = ["TRS file"]
        summary_row = [os.path.basename(self.trs_filename)[:-4]]

        # The with-block closes the export file even if parsing or an output
        # raises part-way through.
        with open(self.export_filename, 'wb') as export_file:
            csv_writer = csv.writer(export_file, quoting=csv.QUOTE_ALL)

            # Header info: one single-cell row per line.
            header_lines = [
                'Export Date: %s' % (UIUtils.get_cur_timestamp_str()),
                'Configuration Creation Date: %s' % (self.config.created),
                'TRS Filename: %s' % (self.trs_filename),
                'Output Configuration:',
                'Name: %s' % (self.config.name),
                'Description: %s' % (self.config.desc),
                '',
                'Outputs:',
                '',
            ]
            for line in header_lines:
                csv_writer.writerow([line])

            # Parse the trs file.
            trs_parser = TRSParser(self.trs_filename)
            segs = trs_parser.parse(progress_update_fcn,
                                    progress_next_phase_fcn,
                                    validate=False)
            chains = None  # populated on demand, then cached

            # Fill each output with segments/chains, then write it out.
            for cur_output in self.config.outputs:
                # Update progress bar text.
                if progress_next_phase_fcn:
                    progress_next_phase_fcn()

                # Clear any cached utterances from previous runs.
                cur_output.reset()

                # Build the chains the first time a chained output needs
                # them.  Testing "is None" keeps an empty chain list cached
                # instead of rebuilding it for every chained output.
                if cur_output.chained and chains is None:
                    chains = FilterManager.get_chains(segs)

                items = chains if cur_output.chained else segs
                for item in items:
                    # Note: filter_utters only affects segs (not chains).
                    cur_output.add_item(item, filter_utters=True)

                # Note: updating progress for every item in the loop above
                # slows processing considerably (by a factor of ~4), so each
                # phase is just set to 100% after it completes.
                if progress_update_fcn:
                    progress_update_fcn(1)

                # Grab the output's results and write them to the file.
                cur_output.write_csv_rows(csv_writer)

                # Collect this output's column for the summary file.
                summary_head += [cur_output.name]
                summary_row += [cur_output.get_summary()]

                csv_writer.writerow([''])

        if self.summary_filename:
            # A heading row is only needed when the summary file is new.
            need_head = not os.path.isfile(self.summary_filename)
            with open(self.summary_filename, 'at') as fp:
                summary_writer = csv.writer(fp, quoting=csv.QUOTE_ALL)
                if need_head:
                    summary_writer.writerow(summary_head)
                summary_writer.writerow(summary_row)
Ejemplo n.º 10
0
def process_dir(path, env, par_code, writer):
    """Total word and utterance counts for the .trs files under ``path``.

    Every file matching ``path + '*.trs'`` is parsed and its segments are
    divided into single, numbered-multi and unnumbered-multi groups.
    ``get_trans_awc`` is called on the groups to accumulate two tallies:

    * "no overlap": single and unnumbered-multi groups, with
      ``exclude_angle=True``;
    * "all speech": all three groups, with ``exclude_angle=False``.

    One row is written to ``writer``: ``par_code``, then for the transcriber
    the word count, utterance count and average for each tally, then the
    same six columns for LENA.  The LENA columns reuse the transcriber
    utterance counts.  An average is reported as 0 when its utterance count
    is 0.

    Args:
        path: Prefix that is glob-expanded with ``*.trs`` appended.
        env: Name used to select the database file
            ``lena_db_path + env + '.db'``.
        par_code: Value written as the first column of the output row.
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
    """
    trs_filenames = glob.glob('%s*.trs' % (path))

    utter_counts = [0] * len(container_types)
    word_counts = [0] * len(container_types)

    trans_no_overlap = container_types.TRANS_NO_OVERLAP
    lena_no_overlap = container_types.LENA_NO_OVERLAP
    trans_all_speech = container_types.TRANS_ALL_SPEECH
    lena_all_speech = container_types.LENA_ALL_SPEECH

    lena_db = Database('%s%s.db' % (lena_db_path, env))
    try:
        for filename in trs_filenames:
            print('\n\tProcessing file %s' % (os.path.basename(filename)))

            segs = get_trans_segs(TRSParser(filename).parse())
            if not segs:
                continue

            print('\tExamining range: %s (%0.2f) - %s (%0.2f)' % (
                get_time_str(segs[0].start), segs[0].start,
                get_time_str(segs[-1].end), segs[-1].end))

            sm = StateMachine()
            single, numbered_multi, unnumbered_multi = sm.divide_segs(segs)

            # (segment group, exclude_angle, transcriber slot, LENA slot)
            passes = (
                # for non-overlapping (no numbered_multi)
                (single, True, trans_no_overlap, lena_no_overlap),
                (unnumbered_multi, True, trans_no_overlap, lena_no_overlap),
                # for all speech
                (single, False, trans_all_speech, lena_all_speech),
                (numbered_multi, False, trans_all_speech, lena_all_speech),
                (unnumbered_multi, False, trans_all_speech, lena_all_speech),
            )
            for group, exclude_angle, trans_slot, lena_slot in passes:
                trans_awc, lena_awc, utter_count = get_trans_awc(
                    group, lena_db, filename, exclude_angle=exclude_angle)
                word_counts[trans_slot] += trans_awc
                word_counts[lena_slot] += lena_awc
                # Utterances are tallied under the transcriber slot only.
                utter_counts[trans_slot] += utter_count
    finally:
        # Release the database even if parsing or counting raised.
        lena_db.close()

    trans_avg_no_overlap = 0
    trans_avg_all_speech = 0
    lena_avg_no_overlap = 0
    lena_avg_all_speech = 0

    no_overlap_utters = utter_counts[trans_no_overlap]
    if no_overlap_utters > 0:
        trans_avg_no_overlap = (
            word_counts[trans_no_overlap] / float(no_overlap_utters))
        # Note: lena and transcriber measures have matching segments, so the
        # count is the same.
        lena_avg_no_overlap = (
            word_counts[lena_no_overlap] / float(no_overlap_utters))

    all_speech_utters = utter_counts[trans_all_speech]
    if all_speech_utters > 0:
        trans_avg_all_speech = (
            word_counts[trans_all_speech] / float(all_speech_utters))
        lena_avg_all_speech = (
            word_counts[lena_all_speech] / float(all_speech_utters))

    writer.writerow([
        par_code,
        word_counts[trans_no_overlap],
        no_overlap_utters,
        '%0.3f' % (trans_avg_no_overlap),
        word_counts[trans_all_speech],
        all_speech_utters,
        '%0.3f' % (trans_avg_all_speech),

        word_counts[lena_no_overlap],
        no_overlap_utters,
        '%0.3f' % (lena_avg_no_overlap),
        word_counts[lena_all_speech],
        all_speech_utters,
        '%0.3f' % (lena_avg_all_speech),
    ])
Ejemplo n.º 11
0
    def create_check(self):
        """Create a Check from the form, store it, and open the test window.

        If ``validate_form`` returns a message it is shown and nothing else
        happens.  Otherwise the input file is parsed (as trs when its name
        ends in ``.trs``, as csv otherwise) and ``num_segs`` segments are
        picked, randomly or contiguously.  If too few segments are
        available an error dialog is shown.  Otherwise the check, any
        segments without a ``db_id``, and one ``Test`` per segment are
        inserted into the database; this window is then destroyed and a
        ``TestWindow`` is opened.
        """
        error_msg = self.validate_form()
        if error_msg:
            UIUtils.show_message_dialog(error_msg)
            return

        filters = self.filters_frame.get_filters()

        check = Check(
            self.form.name_entry.get_text(),
            self.form.input_file_entry.get_text(),
            self.form.wav_file_entry.get_text(),
            self.form.num_segs_spinner.get_value_as_int(),
            self.form.context_pad_spinner.get_value_as_int(),
            [],
            0,
            filters=filters,
            pick_randomly=self.form.rand_checkbox.get_active(),
        )

        progress_dialog = ProgressDialog(
            title='Loading File',
            phases=['Parsing file...', 'Setting up...'])

        if check.input_filename.lower().endswith('.trs'):
            # TRS files
            parser = TRSParser(check.input_filename)
            progress_dialog.show()
            segs = parser.parse(
                progress_update_fcn=progress_dialog.set_fraction,
                progress_next_phase_fcn=progress_dialog.next_phase,
                validate=False,
                seg_filters=check.filters)
        else:
            # CSV files
            parser = CSVParser(check.input_filename)
            progress_dialog.show()
            segs = parser.parse(
                progress_update_fcn=progress_dialog.set_fraction,
                seg_filters=check.filters)

        progress_dialog.next_phase()

        if check.pick_randomly:
            #segs = ParserTools.pick_rand_segs(check.num_segs, segs)
            segs = ParserTools.hacked_pick_rand_segs(
                check.num_segs, segs,
                os.path.basename(check.input_filename))
        else:
            segs = ParserTools.pick_contiguous_segs(check.num_segs, segs)
        progress_dialog.set_fraction(1.0)

        if len(segs) < check.num_segs:
            # Close the progress bar (even though there's still one phase
            # left).
            progress_dialog.ensure_finish()
            UIUtils.show_message_dialog(
                'The input file does not contain enough segments of the specified types.',
                dialog_type=gtk.MessageType.ERROR)
            return

        db = BLLDatabase()
        try:
            check.db_insert(db)

            for i, seg in enumerate(segs):
                # Only segments without a database id are inserted.
                if seg.db_id is None:
                    seg.db_insert(db)

                test = Test(
                    check.db_id,
                    None,
                    None,
                    None,
                    seg,
                    None,
                    check.default_context_padding,
                )
                test.db_insert(db)
                check.tests.append(test)

                progress_dialog.set_fraction(
                    float(i + 1) / float(check.num_segs))
        finally:
            # Close the connection even if one of the inserts raised.
            db.close()

        progress_dialog.ensure_finish()

        self.window.destroy()
        TestWindow(check)