Example #1
0
class JdsDramaHandler:
    """
    Collect the dramas found under the configured path (one drama per top-level
    subfolder) together with their episode files, number them sequentially and
    push them to the database.
    """
    def __init__(self, argv):
        """
        :param argv: command line arguments, handed to parse_args
        """
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """
        Reset the dramas in the database.
        :return: whatever the database's reset_dramas() returns
        """
        return self.db.reset_dramas()

    def read_dramas(self):
        """
        Build one JdsDrama per subfolder of args["path"] and push the list.
        The list starts with the database's merged drama, so the subfolder
        dramas are numbered from 1 upwards.
        """
        folders = DccUtils.get_subfolders(self.args["path"])
        all_dramas = [self.db.get_merged_drama()]
        # the merged drama occupies index 0, hence numbering starts at 1
        all_dramas.extend(
            JdsDrama(number, os.path.basename(folder))
            for number, folder in enumerate(folders, start=1))
        self.db.push_dramas(all_dramas)

    def read_episodes(self):
        """
        Map a running index (starting at 0) to the base name of every file
        found in the subfolders of args["path"], and push that mapping.
        """
        names = [
            os.path.basename(path)
            for folder in DccUtils.get_subfolders(self.args["path"])
            for path in DccUtils.get_files(folder)
        ]
        self.db.push_episodes(dict(enumerate(names)))
Example #2
0
 def __init__(self, argv):
     """
     Parse the command line arguments and create the database handle.
     :param argv: argument list handed to parse_args; the result is kept in self.args
     """
     self.args = parse_args(argv)
     # a fresh JdsDatabase instance per handler
     self.db = JdsDatabase()
Example #3
0
class JdsCharHandler:
    """
    Count the characters used in every drama (from the lines stored in the
    database) and push the resulting counts to the database.
    """
    def __init__(self, argv):
        """
        :param argv: command line arguments, handed to parse_args
        """
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """
        Reset the chars in the database.
        :return: whatever the database's reset_chars() returns
        """
        return self.db.reset_chars()

    def read_chars_worker(self, drama):
        """
        threaded worker that counts all characters for a given drama, by getting all lines from the DB and counting the char.
        requires drama,lines to be in the DB beforehand
        :param drama: drama object; its value and uid attributes are read
        :return: dict mapping each character to a JdsChar carrying its count and
                 the number of distinct episodes it appears in
        """
        chars = {}  # key = char, value = count
        episodes = {}  # key = char, value = set of episode_uid containing it

        print("start read_chars_worker for {}".format(drama.value))
        jds_lines = self.db.get_lines_for_drama(drama)
        cur_start_time = time.perf_counter()
        for jds_line in jds_lines:
            try:
                for char in jds_line.value:
                    chars[char] = chars.get(char, 0) + 1
                    episodes.setdefault(char, set()).add(jds_line.episode_uid)
            except Exception as e:
                # a broken line is logged and skipped, the rest is still counted
                exception(e)

        jds_chars = {}
        for char, count in chars.items():
            new_char = JdsChar.from_drama(char, drama.uid)
            new_char.set_count(count)
            new_char.episode_count = len(episodes[char])
            jds_chars[char] = new_char
        if "\n" in chars:
            # NOTE(review): chars is keyed by str, so this delete only works if
            # JdsChar hashes/compares equal to the plain character; it also only
            # affects the count printed below, "\n" stays in the returned
            # jds_chars. Confirm the intent against JdsChar.
            del chars[JdsChar("\n")]
            print("Deleted \\n")
        run_time = time.perf_counter() - cur_start_time
        print("stop read_chars_worker for {} with {} chars in {:2.2f}".format(
            drama.value, len(chars), run_time))
        return jds_chars

    def read_chars(self):
        """
        Run read_chars_worker for every drama that is not flagged kanji_ok and
        push each result with push_chars_count, then call push_chars once.
        Dramas are submitted to a 4-thread pool in batches of 11.
        """
        # Split skipped/pending dramas up front instead of removing items from
        # the list while iterating over it (which silently skips elements).
        pending = []
        for drama in self.db.get_all_dramas():
            # '==' instead of 'is': identity of int objects is an
            # implementation detail and raises a SyntaxWarning on Python 3.8+
            if drama.kanji_ok == 1:
                print("kanji_ok TRUE -> {} skipped".format(drama.uid))
            else:
                pending.append(drama)

        batch_size = 11  # same cap as before: a batch was closed once it exceeded 10
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            for start in range(0, len(pending), batch_size):
                try:
                    futures = [
                        executor.submit(self.read_chars_worker, drama)
                        for drama in pending[start:start + batch_size]
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        self.db.push_chars_count(future.result())
                except Error as e:
                    # log and carry on with the next batch
                    exception(e)
        self.db.push_chars()

    def create_tables(self):
        """
        Create the char tables in the database.
        """
        self.db.create_char_tables()
Example #4
0
    def update_kanji_pos(self):
        """
        Compute the rank positions and frequencies of all chars and push them.

        Steps:
        1. load the per-char count of the merged drama into self.total_count
        2. number the chars by jlpt level (highest level first) -> jlpt_pos
        3. number the chars by jouyou level (lowest level first) -> jouyou_pos
        4. rank the kanji by total count -> jdpt_pos, freq, cumul_freq, jdpt
        5. compute per-kanji drama and episode frequencies
        """
        # get all char count
        self.total_count = self.db.get_count_for_drama(
            JdsDatabase.get_merged_drama())

        # set jlpt position
        # assumes the level keys are the consecutive ints 0..len-1 and that
        # level 0 is not ranked -- TODO confirm
        jlpt_dict = self.compute_pos_dict('jlpt')
        position = 1
        for level in range(len(jlpt_dict) - 1, 0, -1):
            for char in jlpt_dict[level]:
                char.jlpt_pos = position
                position += 1

        # set jouyou position (same key assumption as above)
        jouyou_dict = self.compute_pos_dict('jouyou')
        position = 1
        for level in range(1, len(jouyou_dict), 1):
            for char in jouyou_dict[level]:
                char.jouyou_pos = position
                position += 1

        # kanji uids by decreasing total count; computed once and reused below
        ranked_kanji = [
            char_uid
            for char_uid in sorted(self.total_count,
                                   key=self.total_count.get,
                                   reverse=True)
            if is_kanji(self.chars[char_uid].value)
        ]

        # find sum of all kanji count
        sum_all_count = sum(
            self.chars[char_uid].count() for char_uid in ranked_kanji)

        # set JDPT level
        position = 1
        cur_level = 6
        jdpt_limits = [1, 0.99, 0.98, 0.95, 0.9, 0.75, 0.5]
        cumul_freq = 0
        for char_uid in ranked_kanji:
            char = self.chars[char_uid]
            count = self.total_count[char_uid]
            freq = count / sum_all_count
            cumul_freq += freq
            char.jdpt_pos = position
            char.freq = freq
            char.cumul_freq = cumul_freq
            char.set_count(count)
            position += 1

            # step down at most one level per kanji once the limit is passed
            # NOTE(review): cur_level is never clamped; if cumul_freq drifts
            # above 1 the index goes negative and wraps around jdpt_limits.
            if cumul_freq > jdpt_limits[cur_level]:
                cur_level -= 1
            if cumul_freq < jdpt_limits[cur_level]:
                char.jdpt = cur_level - 1

        self.db.push_kanji_pos(self.chars)

        # set episode and drama frequency
        char_drama_count = {}
        char_episode_count = {}
        results = self.db.get_kanji_count_raw()
        for result in results:
            kanji_uid = result['kanji_uid']
            if kanji_uid not in char_drama_count:
                char_drama_count[kanji_uid] = 0
                char_episode_count[kanji_uid] = 0
            drama_uid = result['drama_uid']
            episode_count = result['episode_count']
            # uid 0 is presumably the merged drama -- must not be counted twice.
            # '==' instead of 'is': int identity is an implementation detail.
            if drama_uid == 0:
                continue
            char_drama_count[kanji_uid] += 1
            char_episode_count[kanji_uid] += episode_count

        num_of_drama = len(
            self.db.get_all_dramas()) - 1  # -1 due to merge drama
        num_of_episodes = len(self.db.get_all_episodes_raw())

        char_drama_freq = {
            kanji_uid: count / num_of_drama
            for kanji_uid, count in char_drama_count.items()
        }
        char_episode_freq = {
            kanji_uid: count / num_of_episodes
            for kanji_uid, count in char_episode_count.items()
        }

        self.db.push_drama_and_episode_count(char_drama_freq,
                                             char_episode_freq)
Example #5
0
 def __init__(self, argv):
     """
     Parse the command line arguments, create the database handle and load the chars.
     :param argv: argument list handed to parse_args; the result is kept in self.args
     """
     self.args = parse_args(argv)
     self.db = JdsDatabase()
     # chars are fetched once from the database at construction time
     self.chars = self.db.get_all_chars_with_count()
     # starts as None; not assigned anywhere else in this fragment
     self.total_count = None
Example #6
0
class JdsInfoHandler:
    """
    Compute per-character information (jlpt / jouyou levels, rank positions,
    frequencies and flags) and push it to the database.
    """
    def __init__(self, argv):
        """
        :param argv: command line arguments, handed to parse_args
        """
        self.args = parse_args(argv)
        self.db = JdsDatabase()
        self.chars = self.db.get_all_chars_with_count()
        self.total_count = None

    def reset(self):
        """
        Reset the info in the database and prepare it again from self.chars.
        """
        self.db.reset_info()
        self.db.prepare_info(self.chars)

    def _load_levels(self, csv_path, attr_name):
        """
        Read a ';'-separated csv of (character, level) rows and store the level
        on the matching char under attr_name. Chars missing from self.chars are
        created and pushed to the database. Blank rows are skipped.
        :param csv_path: path of the csv file to read
        :param attr_name: name of the char attribute receiving the level
        """
        with open(csv_path, mode='r', encoding='utf-8') as csv_file:
            for row in csv.reader(csv_file, delimiter=';'):
                if not row:
                    # csv.reader yields [] for a blank line
                    continue
                uid = ord(row[0])
                level = int(row[1])
                if uid not in self.chars:
                    new_char = JdsChar(chr(uid))
                    self.chars[uid] = new_char
                    self.db.push_char(new_char)
                setattr(self.chars[uid], attr_name, level)

    def update_jlpt_joyo(self):
        """
        Load the jlpt and jouyou level of each kanji from 'jlpt_kanji.csv' and
        'jouyou_kanji.csv' (read from the current working directory) and push
        the updated chars.
        """
        self._load_levels('jlpt_kanji.csv', 'jlpt')
        self._load_levels('jouyou_kanji.csv', 'jouyou')
        self.db.push_kanji_jlpt_joyo(self.chars)

    def compute_pos_dict(self, jlpt_or_jouyou):
        """
        Group the chars by level and sort each group by decreasing count.
        :param jlpt_or_jouyou: 'jlpt' to group by char.jlpt, anything else groups by char.jouyou
        :return: dict mapping level to the list of chars of that level
        """
        # '==' instead of 'is': string identity depends on interning
        use_jlpt = jlpt_or_jouyou == 'jlpt'
        my_dict = {}
        for char in self.chars.values():
            level = char.jlpt if use_jlpt else char.jouyou
            my_dict.setdefault(level, []).append(char)

        for members in my_dict.values():
            members.sort(key=methodcaller('count'), reverse=True)
        return my_dict

    def update_kanji_pos(self):
        """
        Compute the rank positions and frequencies of all chars and push them.

        Steps:
        1. load the per-char count of the merged drama into self.total_count
        2. number the chars by jlpt level (highest level first) -> jlpt_pos
        3. number the chars by jouyou level (lowest level first) -> jouyou_pos
        4. rank the kanji by total count -> jdpt_pos, freq, cumul_freq, jdpt
        5. compute per-kanji drama and episode frequencies
        """
        # get all char count
        self.total_count = self.db.get_count_for_drama(
            JdsDatabase.get_merged_drama())

        # set jlpt position
        # assumes the level keys are the consecutive ints 0..len-1 and that
        # level 0 is not ranked -- TODO confirm
        jlpt_dict = self.compute_pos_dict('jlpt')
        position = 1
        for level in range(len(jlpt_dict) - 1, 0, -1):
            for char in jlpt_dict[level]:
                char.jlpt_pos = position
                position += 1

        # set jouyou position (same key assumption as above)
        jouyou_dict = self.compute_pos_dict('jouyou')
        position = 1
        for level in range(1, len(jouyou_dict), 1):
            for char in jouyou_dict[level]:
                char.jouyou_pos = position
                position += 1

        # kanji uids by decreasing total count; computed once and reused below
        ranked_kanji = [
            char_uid
            for char_uid in sorted(self.total_count,
                                   key=self.total_count.get,
                                   reverse=True)
            if is_kanji(self.chars[char_uid].value)
        ]

        # find sum of all kanji count
        sum_all_count = sum(
            self.chars[char_uid].count() for char_uid in ranked_kanji)

        # set JDPT level
        position = 1
        cur_level = 6
        jdpt_limits = [1, 0.99, 0.98, 0.95, 0.9, 0.75, 0.5]
        cumul_freq = 0
        for char_uid in ranked_kanji:
            char = self.chars[char_uid]
            count = self.total_count[char_uid]
            freq = count / sum_all_count
            cumul_freq += freq
            char.jdpt_pos = position
            char.freq = freq
            char.cumul_freq = cumul_freq
            char.set_count(count)
            position += 1

            # step down at most one level per kanji once the limit is passed
            # NOTE(review): cur_level is never clamped; if cumul_freq drifts
            # above 1 the index goes negative and wraps around jdpt_limits.
            if cumul_freq > jdpt_limits[cur_level]:
                cur_level -= 1
            if cumul_freq < jdpt_limits[cur_level]:
                char.jdpt = cur_level - 1

        self.db.push_kanji_pos(self.chars)

        # set episode and drama frequency
        char_drama_count = {}
        char_episode_count = {}
        results = self.db.get_kanji_count_raw()
        for result in results:
            kanji_uid = result['kanji_uid']
            if kanji_uid not in char_drama_count:
                char_drama_count[kanji_uid] = 0
                char_episode_count[kanji_uid] = 0
            drama_uid = result['drama_uid']
            episode_count = result['episode_count']
            # uid 0 is presumably the merged drama -- must not be counted twice.
            # '==' instead of 'is': int identity is an implementation detail.
            if drama_uid == 0:
                continue
            char_drama_count[kanji_uid] += 1
            char_episode_count[kanji_uid] += episode_count

        num_of_drama = len(
            self.db.get_all_dramas()) - 1  # -1 due to merge drama
        num_of_episodes = len(self.db.get_all_episodes_raw())

        char_drama_freq = {
            kanji_uid: count / num_of_drama
            for kanji_uid, count in char_drama_count.items()
        }
        char_episode_freq = {
            kanji_uid: count / num_of_episodes
            for kanji_uid, count in char_episode_count.items()
        }

        self.db.push_drama_and_episode_count(char_drama_freq,
                                             char_episode_freq)

    def update_kanji_flags(self):
        """
        Flag every char (1 = is_kanji, 2 = matches the kana pattern, 3 = anything
        else) and push the flags.
        """
        # compiled once instead of looking the pattern up for every char
        kana_pattern = re.compile("[ぁ-んァ-ン]")
        for char in self.chars.values():
            if is_kanji(char.value):
                char.flag = 1
            elif kana_pattern.match(char.value):
                char.flag = 2
            else:
                char.flag = 3
        self.db.push_kanji_info_flags(self.chars)
Example #7
0
class JdsLineHandler:
    """
    Read all lines in the provided folder, assign them a unique uid and push the result in the database
    The drama must have been loaded to the database before (via JdsDramaHandler)
    """
    def __init__(self, argv):
        """
        :param argv: command line arguments, handed to parse_args
        """
        self.args = parse_args(argv)
        self.db = JdsDatabase()
        self.episode_to_uid = {}  # key = episode file name, value = episode_uid

    def reset(self):
        """
        Reset the lines in the database.
        :return: whatever the database's reset_lines() returns
        """
        return self.db.reset_lines()

    def line_ref_worker(self, subfolder):
        """
        threaded worker that reads every file of a drama subfolder and wraps each text line in a JdsLine.
        requires the drama to be in the DB and setup() to have been called beforehand
        :param subfolder: path of the drama folder; its base name is used to look up the drama
        :return: list of JdsLine, all with uid=0 (the final uid is assigned by read_lines)
        """
        lines = []
        drama = self.db.get_drama(os.path.basename(subfolder))
        print("read_lines for drama {}".format(drama.uid))
        for filepath in DccUtils.get_files(subfolder):
            filename = os.path.basename(filepath)
            with open(filepath, encoding='utf-8') as file:
                try:
                    # readlines() on purpose: a file that fails to decode
                    # contributes no lines at all instead of a partial set
                    for line in file.readlines():
                        try:
                            lines.append(
                                JdsLine(
                                    uid=0,
                                    drama_uid=drama.uid,
                                    value=line,
                                    episode_uid=self.episode_to_uid[filename]))
                        except Exception as e:
                            # a single bad line is logged and skipped
                            exception(e)
                except Exception as e:
                    # an unreadable file is logged and skipped
                    exception(e)
        return lines

    def read_lines(self):
        """
        Run line_ref_worker for every subfolder of args["path"], give every
        returned line a running uid (starting at 0) and push the lines.
        Subfolders are submitted to a 4-thread pool in batches of 16.
        """
        line_id = 0
        # Work on slices of a snapshot instead of removing items from the list
        # while iterating over it (which silently skips elements).
        subfolders = list(DccUtils.get_subfolders(self.args["path"]))
        batch_size = 16  # same cap as before: a batch was closed once it exceeded 15
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            try:
                for start in range(0, len(subfolders), batch_size):
                    futures = [
                        executor.submit(self.line_ref_worker, subfolder)
                        for subfolder in subfolders[start:start + batch_size]
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        lines = future.result()
                        for line in lines:
                            line.uid = line_id
                            line_id += 1
                        self.db.push_lines(lines)
            except Error as e:
                # the first Error stops the processing of the remaining batches
                exception(e)

    def setup(self):
        """
        Fill episode_to_uid from the episodes stored in the database.
        """
        for result in self.db.get_episodes_raw():
            self.episode_to_uid[result['name']] = result['episode_uid']
class JdsLineRefHandler:
    """
    Build, for every drama, the references between characters and the lines
    they appear in, and push them to the database.
    """
    def __init__(self, argv):
        """
        :param argv: command line arguments, handed to parse_args
        """
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """
        Reset the line references in the database.
        :return: whatever the database's reset_line_refs() returns
        """
        return self.db.reset_line_refs()

    def line_ref_worker(self, drama):
        """
        threaded worker that build references of characters with lines.
        requires drama,lines to be in the DB beforehand
        :param drama: drama object; its value and uid attributes are read
        :return: dict mapping each character to a JdsChar holding at most the
                 first 10 line uids in which the character appears
        """
        lines = {}  # key = char, value = [] of line_uid
        jds_lines = self.db.get_lines_for_drama(drama)
        print("start line_ref_worker for {}".format(drama.value))
        cur_start_time = time.perf_counter()
        for jds_line in jds_lines:
            for char in jds_line.value:
                try:
                    lines.setdefault(char, []).append(jds_line.uid)
                except Exception as e:
                    exception(e)

        jds_chars = {}
        for char, line_uids in lines.items():
            new_char = JdsChar.from_drama(char, drama.uid)
            # only the first 10 references are kept per char
            new_char.add_line_refs(line_uids[:10])
            jds_chars[char] = new_char
        if "\n" in lines:
            # NOTE(review): lines is keyed by str, so this delete only works if
            # JdsChar hashes/compares equal to the plain character; it also only
            # affects the count printed below, "\n" stays in the returned
            # jds_chars. Confirm the intent against JdsChar.
            del lines[JdsChar("\n")]
            print("Deleted \\n")
        run_time = time.perf_counter() - cur_start_time
        print("stop line_ref_worker for {} with {} chars in {}".format(
            drama.value, len(lines), run_time))
        return jds_chars

    def do_line_ref(self):
        """
        Run line_ref_worker for every drama that is not flagged
        kanji_line_ref_ok and push each result with push_chars_to_line.
        Dramas are submitted to a 4-thread pool in batches of 16.
        """
        # Split skipped/pending dramas up front instead of removing items from
        # the list while iterating over it (which silently skips elements).
        pending = []
        for drama in self.db.get_all_dramas():
            # '==' instead of 'is': identity of int objects is an
            # implementation detail and raises a SyntaxWarning on Python 3.8+
            if drama.kanji_line_ref_ok == 1:
                print(
                    "kanji_line_ref_ok TRUE -> {} skipped".format(
                        drama.uid))
            else:
                pending.append(drama)

        batch_size = 16  # same cap as before: a batch was closed once it exceeded 15
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            for start in range(0, len(pending), batch_size):
                try:
                    futures = [
                        executor.submit(self.line_ref_worker, drama)
                        for drama in pending[start:start + batch_size]
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        self.db.push_chars_to_line(future.result())
                except Error as e:
                    # log and carry on with the next batch
                    exception(e)
Example #9
0
from python import settings
from python.DccUtils import parse_args
from python.JdsDatabase import JdsDatabase

if __name__ == "__main__":
    print("{} started".format(__file__))
    start_time = time.perf_counter()

    pr = None
    if settings.enable_profiler:
        pr = cProfile.Profile()
        pr.enable()

    args = parse_args(sys.argv[1:])
    db = JdsDatabase()
    kanji_info_results = db.get_kanji_info_raw()
    kanji_count_results = db.get_kanji_count_raw()

    with open(settings.csv_path_kanji, mode='w', encoding='utf-8',
              newline='') as csv_file:
        fieldnames = [
            'kanji', 'count', 'freq', 'cumul_freq', 'drama_freq',
            'episode_freq', 'jdpt', 'jdpt_pos', 'jouyou', 'jouyou_pos'
        ]
        writer = csv.DictWriter(csv_file,
                                fieldnames=fieldnames,
                                delimiter='\t')
        rows = {}
        writer.writeheader()
        for result in kanji_info_results: