Example #1
0
    def onAnalyze(self):
        """Measure readability of the configured input and write the reports.

        Reads every setting from ``p['DEFAULT']``, loads the master frequency
        list and the known-morph database, then measures each accepted input
        file (``.srt``, ``.ass``, ``.txt``) found at the configured input
        path. When no usable input path is configured the clipboard contents
        are measured instead. Depending on the ``save_*`` settings a word
        report, a study plan and a frequency list are written into the
        configured output directory; a readability log is always written
        there. Results are reported through ``self.writeOutput`` and the
        known-morph database is stored on ``self.known_db``.

        Note: building the study plan adds the planned morphs to the
        known-morph database object (and therefore to ``self.known_db``).
        """
        config = p['DEFAULT']

        self.morphemizer = getMorphemizerByName(config['currmophemizer'])
        self.writeOutput('Using morphemizer: %s \n' %
                         self.morphemizer.getDescription())
        debug_output = False

        # A file/folder is only used when 'inputtype' is checked and
        # 'minimized' is off; otherwise input_path stays None and the
        # clipboard is analyzed. (Must be `not`, not `~`: bitwise inversion
        # of a bool is always truthy.)
        input_path = None
        if config.getboolean('inputtype') and not config.getboolean(
                'minimized'):
            # TODO if certain keypress, force analyze through clipboard
            input_path = config['inputpath']

        minimum_master_frequency = config.getint('min_master_freq')
        readability_target = config.getfloat('read_target')
        master_freq_path = config['frequencylist']
        known_words_path = config['knownmorphs']

        output_path = config['outputpath']

        save_frequency_list = config.getboolean('save_freqency_list')
        save_word_report = config.getboolean('save_word_report')
        save_study_plan = config.getboolean('save_study_plan')

        source_score_multiplier = config.getfloat('SourceScoreMultiplier')
        source_score_power = config.getfloat('SourceScorePower')

        proper_nouns_known = config.getboolean('ProperNounsAlreadyKnown')
        fill_all_morphs_in_plan = config.getboolean(
            'FillAllMorphsInStudyPlan')

        if not os.path.exists(output_path):
            # exist_ok covers the race where the directory appears between
            # the check above and the creation.
            os.makedirs(output_path, exist_ok=True)

        frequency_list_path = os.path.normpath(output_path + '/frequency.txt')
        word_report_path = os.path.normpath(output_path +
                                            '/word_freq_report.txt')
        study_plan_path = os.path.normpath(output_path + '/study_plan.txt')
        readability_log_path = os.path.normpath(output_path +
                                                '/readability_log.txt')

        master_db = MorphDb()
        unknown_db = MorphDb()

        master_total_instances = 0
        master_current_score = 0

        # Occurrence count of every morph seen across all inputs; feeds the
        # word report.
        all_morphs = {}

        if os.path.isfile(master_freq_path):
            with io.open(master_freq_path, encoding='utf-8-sig') as csvfile:
                for row in csv.reader(csvfile, delimiter="\t"):
                    try:
                        instances = int(row[0])
                        m = Morpheme(row[1], row[2], row[2], row[3], row[4],
                                     row[5])
                        master_db.addMorph(m, instances)
                    except (IndexError, ValueError):
                        # Best effort: skip rows that are too short or whose
                        # count is not an integer (e.g. a header line).
                        continue
                    master_total_instances += instances
            self.writeOutput("Master morphs loaded: K %d V %d\n" %
                             (master_db.getTotalNormMorphs(),
                              master_db.getTotalVariationMorphs()))
        else:
            self.writeOutput("Master frequency file '%s' not found.\n" %
                             master_freq_path)
            minimum_master_frequency = 0

        if os.path.isfile(known_words_path):
            known_db = MorphDb(known_words_path, ignoreErrors=True)

            total_k = len(known_db.groups)
            total_v = len(known_db.db)
            self.writeOutput("Known morphs loaded: K %d V %d\n" %
                             (total_k, total_v))
        else:
            self.writeOutput("Known words DB '%s' not found\n" %
                             known_words_path)
            known_db = MorphDb()
        self.known_db = known_db

        def master_known_score():
            """Sum the counts of master morphs that known_db matches.

            Each matched entry is also flagged by setting its second slot to
            True.
            """
            score = 0
            for variations in master_db.db.values():
                for morph, entry in variations.items():
                    if known_db.matches(morph):
                        score += entry[0]
                        entry[1] = True  # mark matched
            return score

        if master_total_instances > 0:
            master_current_score = master_known_score()
            self.writeOutput(
                "\n[Current master frequency readability] %0.02f\n" %
                (master_current_score * 100.0 / master_total_instances))

        sources = []

        def measure_readability(file_name, is_ass, is_srt):
            """Measure one input and report its readability statistics.

            ``file_name`` is a path, or the literal 'clipboard' to read the
            clipboard. ``is_ass`` / ``is_srt`` select subtitle-specific line
            filtering. Appends a ``Source`` to ``sources`` when the study
            plan is enabled. Any failure is reported and re-raised.
            """
            self.writeOutput(
                '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                ("Input", "Total Morphs", "Known Morphs", "% Known Morphs",
                 "Total Instances", "Known Instances", "% Readability",
                 "% Proper Nouns", "% Known Lines", "% i+1 Lines"))

            # file_name will be 'clipboard' if reading from clipboard
            log_fp.write('measure_readability %s\n' % file_name)

            proper_noun_count = 0
            i_count = 0
            line_count = 0
            line_morphs = []
            known_line_count = 0
            iplus1_line_count = 0
            known_count = 0
            seen_morphs = {}
            known_morphs = {}
            source_unknown_db = MorphDb()

            def parse_text(text):
                """Morphemize one chunk of text and update all counters."""
                nonlocal i_count, known_count, proper_noun_count
                nonlocal line_count, known_line_count, iplus1_line_count

                log_fp.write('=== parse_text ===\n' + text + '\n')
                parsed_morphs = getMorphemes(self.morphemizer,
                                             stripHTML(text))
                if len(parsed_morphs) == 0:
                    return

                unknown_count = 0
                line_missing_morphs = set()
                for m in parsed_morphs:
                    # Count morph for word report
                    all_morphs[m] = all_morphs.get(m, 0) + 1
                    seen_morphs[m] = seen_morphs.get(m, 0) + 1

                    is_proper_noun = m.isProperNoun()
                    if is_proper_noun:
                        proper_noun_count += 1

                    i_count += 1
                    # Proper nouns are easy to learn, so assume they're
                    # known.
                    if known_db.matches(m) or is_proper_noun:
                        known_morphs[m] = known_morphs.get(m, 0) + 1
                        known_count += 1
                    else:
                        unknown_db.addMorph(m, 1)
                        source_unknown_db.addMorph(m, 1)
                        line_missing_morphs.add(m)
                        unknown_count += 1

                line_count += 1
                if unknown_count == 0:
                    known_line_count += 1
                elif unknown_count == 1:
                    iplus1_line_count += 1
                line_morphs.append(line_missing_morphs)

            def proc_lines(text, is_ass, is_srt):
                """Filter subtitle markup line by line and parse the text."""
                text_index = -1
                num_fields = 1
                srt_count = 0

                pending = []  # text lines collected since the last flush
                for t in text.splitlines():
                    should_flush = True
                    if is_ass:
                        if 'Format:' in t:
                            # The Format line names the fields; remember
                            # where 'Text' is so Dialogue lines can be split.
                            formats = [x.strip() for x in t[8:].split(',')]
                            if 'Text' in formats:
                                text_index = formats.index('Text')
                                num_fields = len(formats)
                            else:
                                text_index = -1
                            continue
                        elif ('Dialogue:' not in t) or (text_index < 0):
                            continue
                        # Limit the split so commas inside the last field
                        # are preserved.
                        t = t[9:].split(',', num_fields - 1)
                        t = t[text_index]
                    elif is_srt:
                        srt_count += 1
                        if srt_count <= 2:
                            # First two lines of each block are skipped
                            # (presumably cue number and timestamp).
                            continue
                        elif t == '':
                            # Blank line ends the block: flush its text.
                            srt_count = 0
                        else:
                            should_flush = False

                    if t != '':
                        pending.append(t + '\n')

                    # Todo: This will flush every line so we can compute
                    #       per-line readability, which is slower than
                    #       batching lines. Figure out how to get per-line
                    #       analysis with batched lines.
                    if should_flush:
                        parse_text(''.join(pending))
                        pending = []

                parse_text(''.join(pending))

            try:
                if file_name == 'clipboard':
                    content = pyperclip.paste()
                else:
                    with open(file_name.strip(), 'rt', encoding='utf-8') as f:
                        content = f.read()

                # Drop any byte-order marks embedded in the text.
                content = content.replace(u'\ufeff', '')

                proc_lines(content, is_ass, is_srt)
                source = Source(file_name, seen_morphs, line_morphs,
                                source_unknown_db)

                total_seen = len(seen_morphs)
                known_percent = (0.0 if total_seen == 0 else
                                 100.0 * len(known_morphs) / total_seen)
                readability = (0.0 if i_count == 0 else
                               100.0 * known_count / i_count)
                # Guard on the actual divisor (i_count).
                proper_noun_percent = (0.0 if i_count == 0 else
                                       100.0 * proper_noun_count / i_count)
                line_percent = (0.0 if line_count == 0 else
                                100.0 * known_line_count / line_count)
                iplus1_percent = (0.0 if line_count == 0 else
                                  100.0 * iplus1_line_count / line_count)

                self.writeOutput(
                    '%s\t%d\t%d\t%0.2f\t%d\t%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n' %
                    (source.name, len(seen_morphs), len(known_morphs),
                     known_percent, i_count, known_count, readability,
                     proper_noun_percent, line_percent, iplus1_percent))

                if save_study_plan:
                    sources.append(source)
            except Exception:
                self.writeOutput("Failed to process '%s'\n" % file_name)
                raise

        def accepted_filetype(filename):
            """Return True for the file extensions that can be analyzed."""
            return filename.lower().endswith(('.srt', '.ass', '.txt'))

        # None means "no usable input path": analyze the clipboard instead.
        list_of_files = None
        if input_path:
            if os.path.isfile(input_path):
                print('getting info from files!')
                # os.walk() yields nothing for a plain file, so handle a
                # single-file input explicitly.
                list_of_files = ([input_path]
                                 if accepted_filetype(input_path) else [])
            elif os.path.isdir(input_path):
                print('getting info from files!')
                list_of_files = []
                for (dirpath, _, filenames) in os.walk(input_path):
                    list_of_files += [
                        os.path.join(dirpath, filename)
                        for filename in filenames
                        if accepted_filetype(filename)
                    ]

        # The readability log is only written while measuring, so keep it
        # open just for this phase; the context manager also closes it on
        # the early return and on errors.
        with open(readability_log_path, 'wt', encoding='utf-8') as log_fp:
            if list_of_files is None:
                measure_readability('clipboard', False,
                                    False)  # for clipboard run
            elif len(list_of_files) > 0:
                for file_path in sorted(list_of_files, key=natural_keys):
                    # TODO ADD PROGRESS BAR
                    if os.path.isfile(file_path):
                        extension = os.path.splitext(file_path)[1].lower()
                        measure_readability(file_path, extension == '.ass',
                                            extension == '.srt')
            else:
                self.writeOutput('\nNo files found to process.\n')
                return

        if save_word_report:
            self.writeOutput("\n[Saving word report to '%s'...]\n" %
                             word_report_path)
            with open(word_report_path, 'wt', encoding='utf-8') as f:
                last_count = 0
                morph_idx = 0
                group_idx = 0  # rank shared by morphs with the same count
                morph_total = 0.0  # cumulative percentage so far
                all_morphs_count = sum(all_morphs.values())

                for morph, count in sorted(all_morphs.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True):
                    if count != last_count:
                        last_count = count
                        group_idx += 1
                    morph_idx += 1
                    morph_delta = 100.0 * count / all_morphs_count
                    morph_total += morph_delta
                    print(
                        '%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%0.8f\t%0.8f matches %d'
                        % (count, morph.norm, morph.base, morph.read,
                           morph.pos, morph.subPos, group_idx, morph_idx,
                           morph_delta, morph_total, known_db.matches(morph)),
                        file=f)

        learned_tot = 0
        learned_morphs = []

        all_missing_morphs = []

        def get_line_readability(show, known_db):
            """Return the percentage of lines in ``show`` with no unknowns.

            A line counts as known when every morph recorded as missing for
            it is matched by ``known_db``.
            """
            known_lines = sum(
                1 for missing in show.line_morphs
                if all(known_db.matches(m) for m in missing))
            if known_lines == 0:
                return 0.0
            return 100.0 * known_lines / len(show.line_morphs)

        if save_study_plan:
            self.writeOutput("\n[Saving Study Plan to '%s'...]\n" %
                             study_plan_path)
            with open(study_plan_path, 'wt', encoding='utf-8') as f:
                for s in sources:
                    known_i = 0
                    seen_i = 0
                    learned_m = 0
                    # Tuples of (morph, count, source_unknown_count,
                    # unknown_count, master_count, score).
                    missing_morphs = []

                    old_line_readability = get_line_readability(s, known_db)

                    for morph, count in s.morphs.items():
                        seen_i += count
                        if known_db.matches(morph) or (proper_nouns_known and
                                                       morph.isProperNoun()):
                            known_i += count
                        else:
                            source_unknown_count = s.unknown_db.getFuzzyCount(
                                morph, known_db)
                            unknown_count = unknown_db.getFuzzyCount(
                                morph, known_db)
                            master_count = master_db.getFuzzyCount(
                                morph, known_db)
                            source_count = source_unknown_count + unknown_count

                            score = pow(
                                source_count, source_score_power
                            ) * source_score_multiplier + master_count
                            missing_morphs.append(
                                (morph, count, source_unknown_count,
                                 unknown_count, master_count, score))

                            if debug_output:
                                f.write(
                                    '  missing: ' + morph.show() +
                                    '\t[score %d ep_freq %d all_freq %d master_freq %d]\n'
                                    % (score, source_unknown_count,
                                       unknown_count, master_count))

                    all_missing_morphs += missing_morphs
                    readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                    old_readability = readability

                    learned_this_source = []

                    # Learn the highest-scoring missing morphs first until
                    # the readability target is reached.
                    for m in sorted(missing_morphs,
                                    key=operator.itemgetter(5),
                                    reverse=True):
                        if readability >= readability_target:
                            if debug_output:
                                f.write('  readability target reached\n')
                            break

                        if known_db.matches(m[0]):
                            if debug_output:
                                f.write('  known: %s\n' % m[0].show())
                            continue

                        if m[4] < minimum_master_frequency:
                            if debug_output:
                                f.write(
                                    '  low score: %s [score %d ep_freq %d all_freq %d master_freq %d]\n'
                                    % (m[0].show(), m[5], m[2], m[3], m[4]))
                            continue

                        learned_morphs.append(m)
                        learned_this_source.append(m)
                        known_i += s.unknown_db.getFuzzyCount(m[0], known_db)
                        learned_m += 1
                        readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                        # Treat the morph as known for the rest of the plan.
                        known_db.addMLs1(m[0], set())

                    new_line_readability = get_line_readability(s, known_db)

                    learned_tot += learned_m
                    source_str = "'%s' study goal: (%3d/%4d) morph readability: %0.2f -> %0.2f line readabiltiy: %0.2f -> %0.2f\n" % (
                        s.name, learned_m, learned_tot, old_readability,
                        readability, old_line_readability,
                        new_line_readability)
                    self.writeOutput(source_str)
                    f.write(source_str)

                    for m in learned_this_source:
                        f.write(
                            '\t' + m[0].show() +
                            '\t[score %d ep_freq %d all_freq %d master_freq %d]\n'
                            % (m[5], m[2], m[3], m[4]))

                if save_frequency_list:
                    self.writeOutput("\n[Saving frequency list to '%s'...]\n" %
                                     frequency_list_path)
                    with open(frequency_list_path, 'wt',
                              encoding='utf-8') as freq_file:
                        unique_set = set()  # bases already written
                        # First output morphs according to the plan.
                        for m in learned_morphs:
                            if m[0].base in unique_set:
                                continue
                            unique_set.add(m[0].base)
                            print(
                                m[0].base +
                                '\t[score %d ep_freq %d all_freq %d master_freq %d]'
                                % (m[5], m[2], m[3], m[4]),
                                file=freq_file)

                        # Followed by all remaining morphs sorted by score.
                        if fill_all_morphs_in_plan:
                            for m in sorted(all_missing_morphs,
                                            key=operator.itemgetter(5),
                                            reverse=True):
                                if m[0].base in unique_set:
                                    continue
                                if m[4] < minimum_master_frequency:
                                    continue
                                unique_set.add(m[0].base)
                                print(
                                    m[0].base +
                                    '\t[score %d ep_freq %d all_freq %d master_freq %d]'
                                    % (m[5], m[2], m[3], m[4]),
                                    file=freq_file)

                if master_total_instances > 0:
                    # Recompute now that the planned morphs count as known.
                    master_score = master_known_score()
                    self.writeOutput(
                        "\n[New master frequency readability] %0.02f -> %0.02f\n"
                        %
                        (master_current_score * 100.0 / master_total_instances,
                         master_score * 100.0 / master_total_instances))