def read_chars(self):
    """
    Count the characters of every drama that still needs it and store the
    results in the DB.

    Dramas whose ``kanji_ok`` flag equals 1 are skipped. The remaining
    dramas are handed to ``read_chars_worker`` on a 4-thread pool, batch by
    batch; each worker result is passed to ``self.db.push_chars_count`` as
    soon as it is ready. ``self.db.push_chars`` is called once at the end.
    :return: None
    """
    # The original loop stopped submitting once more than 10 futures were
    # queued, i.e. it worked in batches of 11.
    batch_size = 11

    pending = []
    # ``or []`` guards against a None result from the DB layer.
    for drama in self.db.get_all_dramas() or []:
        # Compare by value: ``is 1`` tests object identity, which only
        # happens to work for cached small ints and misses equal values
        # such as True.
        if drama.kanji_ok == 1:
            print("kanji_ok TRUE -> {} skipped".format(drama.uid))
            continue
        pending.append(drama)

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Slice the work list instead of removing items from it while
        # iterating over it, which skipped every other drama per pass.
        for start in range(0, len(pending), batch_size):
            batch = pending[start:start + batch_size]
            try:
                futures = [
                    executor.submit(self.read_chars_worker, drama)
                    for drama in batch
                ]
                for future in concurrent.futures.as_completed(futures):
                    self.db.push_chars_count(future.result())
            except Error as e:
                # Give up on this batch only; later batches still run.
                exception(e)
    self.db.push_chars()
def __cursor_execute_thread_safe(self, sql):
    """
    Execute a statement and commit it while holding the class-wide lock.

    If the first attempt raises ``Error``, the connection is re-established
    with ``reconnect(3, 3)`` and the statement is run exactly once more.
    :param sql: SQL statement to execute
    :return: True when the statement was executed and committed, False when
             the retry failed or the connection could not be restored
    """
    with JdsDatabase.__lock:
        if settings.print_sql:
            print(sql)
        cur_start_time = time.perf_counter()
        try:
            self.__cursor.execute(sql)
            self.__db.commit()
        except Error:
            # First attempt failed: reconnect and try again once.
            try:
                self.__db.reconnect(3, 3)
                if not self.__check_state():
                    print("Could not reconnect to server")
                    # Nothing was executed, so do not report success.
                    return False
                self.__cursor.execute(sql)
                self.__db.commit()
            except Error as e:
                exception(e)
                return False
        if settings.print_sql:
            print("in {:2.2f}".format(time.perf_counter() - cur_start_time))
        return True
def line_ref_worker(self, drama):
    """
    threaded worker that builds references of characters with lines.
    requires drama,lines to be in the DB beforehand
    :param drama: drama whose lines are read through self.db
    :return: dict mapping each character to the object built by
             JdsChar.from_drama, carrying at most 10 line references
    """
    lines = {}  # key = char, value = [] of line_uid
    # A None result is treated like a drama without lines.
    jds_lines = self.db.get_lines_for_drama(drama) or []
    print("start line_ref_worker for {}".format(drama.value))
    cur_start_time = time.perf_counter()
    for jds_line in jds_lines:
        try:
            for char in jds_line.value:
                lines.setdefault(char, []).append(jds_line.uid)
        except Exception as e:
            # One unreadable line must not abort the whole drama.
            exception(e)
    jds_chars = {}
    for char, line_uids in lines.items():
        new_char = JdsChar.from_drama(char, drama.uid)
        # Only the first 10 references per character are kept.
        new_char.add_line_refs(line_uids[:10])
        jds_chars[char] = new_char
    if "\n" in lines:
        # Keys of ``lines`` are plain characters, so delete with the same
        # key that was just tested rather than with a JdsChar instance.
        del lines["\n"]
        print("Deleted \\n")
    # NOTE(review): only ``lines`` (and therefore the count printed below)
    # loses the newline; the returned jds_chars still has a "\n" entry.
    # Confirm whether it should be dropped there as well.
    run_time = time.perf_counter() - cur_start_time
    print("stop line_ref_worker for {} with {} chars in {}".format(
        drama.value, len(lines), run_time))
    return jds_chars
def get_count_for_drama(self, drama):
    """
    Fetch the counts stored in the ``count`` table for one drama.
    :param drama: object whose ``uid`` selects the rows
    :return: dict of kanji_uid -> count, or None when the state check fails
    """
    state_ok = JdsDatabase.__check_state()
    if not state_ok:
        return None
    rows = self.__cursor_execute_fetchall_thread_safe(
        "SELECT * FROM count WHERE drama_uid={} ".format(drama.uid))
    counts = {}
    try:
        for row in rows:
            counts[row['kanji_uid']] = row['count']
    except Exception as err:
        # Whatever was collected before the failure is still returned.
        exception(err)
    return counts
def get_all_chars(self):
    """
    Load every row of the ``kanji`` table as a JdsChar.
    :return: dict of kanji_uid -> JdsChar(chr(kanji_uid)), or None when the
             state check fails
    """
    if not JdsDatabase.__check_state():
        return None
    rows = self.__cursor_execute_fetchall_thread_safe("SELECT * FROM kanji ")
    by_uid = {}
    try:
        for row in rows:
            # The uid doubles as the code point handed to chr().
            jds_char = JdsChar(chr(row['kanji_uid']))
            by_uid[row['kanji_uid']] = jds_char
    except Exception as err:
        # Rows converted before the failure are still returned.
        exception(err)
    return by_uid
def get_lines_for_drama(self, drama):
    """
    Load all rows of the ``line`` table that belong to one drama.
    :param drama: object whose ``uid`` selects the rows
    :return: list of JdsLine, or None when the state check fails
    """
    if not JdsDatabase.__check_state():
        return None
    query = "SELECT * FROM line WHERE drama_uid={}".format(drama.uid)
    rows = self.__cursor_execute_fetchall_thread_safe(query)
    collected = []
    try:
        for row in rows:
            jds_line = JdsLine(
                row['line_uid'],
                row['drama_uid'],
                row['value'],
                row['episode_uid'],
            )
            collected.append(jds_line)
    except Exception as err:
        # Lines built before the failure are still returned.
        exception(err)
    return collected
def __cursor_execute_fetchone_thread_safe(self, sql):
    """
    Run a query under the class-wide lock and return its first row.

    When the first execute raises ``Error``, the connection is
    re-established with ``reconnect(3, 5)`` and the query is run once more;
    an ``Error`` raised during that retry is logged via ``exception``.
    :param sql: query to execute
    :return: the result of the cursor's ``fetchone()``
    """
    with JdsDatabase.__lock:
        if settings.print_sql:
            print(sql)
        cur_start_time = time.perf_counter()
        try:
            self.__cursor.execute(sql)
        except Error:
            try:
                self.__db.reconnect(3, 5)
                # Re-run the query after reconnecting; otherwise the
                # fetchone() below reads from a cursor that never
                # executed it.
                self.__cursor.execute(sql)
            except Error as e:
                exception(e)
        if settings.print_sql:
            print("in {:2.2f}".format(time.perf_counter() - cur_start_time))
        # Fetch from the same cursor the query was executed on.
        return self.__cursor.fetchone()
def get_all_lines_by_drama(self):
    """
    Load the whole ``line`` table, grouped by drama.
    :return: dict of drama_uid -> list of JdsLine, or None when the state
             check fails
    """
    if not JdsDatabase.__check_state():
        return None
    rows = self.__cursor_execute_fetchall_thread_safe("SELECT * FROM line ")
    grouped = {}
    try:
        for row in rows:
            # setdefault creates the per-drama bucket on first sight.
            grouped.setdefault(row['drama_uid'], []).append(
                JdsLine(row['line_uid'], row['drama_uid'], row['value'],
                        row['episode_uid']))
    except Exception as err:
        # Groups filled before the failure are still returned.
        exception(err)
    return grouped
def get_all_chars_with_count(self):
    """
    Load every kanji joined with its ``count`` row where drama_uid is 0.
    :return: dict of kanji_uid -> JdsChar with its count set, or None when
             the state check fails
    """
    if not JdsDatabase.__check_state():
        return None
    query = (
        " SELECT a.value, a.kanji_uid, b.count FROM kanji a "
        "INNER JOIN count b ON a.kanji_uid = b.kanji_uid "
        "WHERE b.drama_uid = 0 "
    )
    rows = self.__cursor_execute_fetchall_thread_safe(query)
    by_uid = {}
    try:
        for row in rows:
            jds_char = JdsChar(chr(row['kanji_uid']))
            jds_char.set_count(row['count'])
            by_uid[row['kanji_uid']] = jds_char
    except Exception as err:
        # Entries built before the failure are still returned.
        exception(err)
    return by_uid
def read_chars_worker(self, drama):
    """
    threaded worker that counts all characters for a given drama, by
    getting all lines from the DB and counting the chars.
    requires drama,lines to be in the DB beforehand
    :param drama: drama whose lines are read through self.db
    :return: dict mapping each character to the object built by
             JdsChar.from_drama, with its count and episode_count set
    """
    chars = {}  # key = char, value = count
    episodes = {}  # key = char, value = set of episode_uid
    print("start read_chars_worker for {}".format(drama.value))
    # A None result is treated like a drama without lines.
    jds_lines = self.db.get_lines_for_drama(drama) or []
    cur_start_time = time.perf_counter()
    for jds_line in jds_lines:
        try:
            for char in jds_line.value:
                chars[char] = chars.get(char, 0) + 1
                # A set ignores duplicates, so no membership test is needed.
                episodes.setdefault(char, set()).add(jds_line.episode_uid)
        except Exception as e:
            # One unreadable line must not abort the whole drama.
            exception(e)
    jds_chars = {}
    for char, count in chars.items():
        new_char = JdsChar.from_drama(char, drama.uid)
        new_char.set_count(count)
        new_char.episode_count = len(episodes[char])
        jds_chars[char] = new_char
    if "\n" in chars:
        # Keys of ``chars`` are plain characters, so delete with the same
        # key that was just tested rather than with a JdsChar instance.
        del chars["\n"]
        print("Deleted \\n")
    # NOTE(review): only ``chars`` (and therefore the count printed below)
    # loses the newline; the returned jds_chars still has a "\n" entry.
    # Confirm whether it should be dropped there as well.
    run_time = time.perf_counter() - cur_start_time
    print("stop read_chars_worker for {} with {} chars in {:2.2f}".format(
        drama.value, len(chars), run_time))
    return jds_chars
def read_lines(self):
    """
    Process every subfolder of ``self.args["path"]`` on a 4-thread pool and
    push the resulting lines to the DB.

    Each item of a worker result gets a running ``uid`` (starting at 0,
    assigned in completion order) before the result is handed to
    ``self.db.push_lines``. An ``Error`` stops all remaining work and is
    logged via ``exception``.
    :return: None
    """
    # The original loop stopped submitting once more than 15 futures were
    # queued, i.e. it worked in batches of 16.
    batch_size = 16
    line_id = 0
    subfolders = list(DccUtils.get_subfolders(self.args["path"]))
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        try:
            # Slice the work list instead of removing items from it while
            # iterating over it, which skipped every other subfolder per
            # pass.
            for start in range(0, len(subfolders), batch_size):
                batch = subfolders[start:start + batch_size]
                # NOTE(review): line_ref_worker as defined in this file
                # takes a drama and returns a dict keyed by character;
                # iterating that yields str keys, which cannot take a
                # ``uid`` attribute. Confirm this is the intended worker.
                futures = [
                    executor.submit(self.line_ref_worker, subfolder)
                    for subfolder in batch
                ]
                for future in concurrent.futures.as_completed(futures):
                    lines = future.result()
                    for line in lines:
                        line.uid = line_id
                        line_id += 1
                    self.db.push_lines(lines)
        except Error as e:
            exception(e)
def main(argv):
    """
    Strip every line without readable text from all files below a folder.

    After an interactive y/n confirmation, each file found through
    ``get_subfolders``/``get_files`` is read as UTF-8 and rewritten in
    place, keeping only lines that contain kanji, hiragana, katakana,
    Latin letters or one of the marks 々〆〤. Files that read as empty are
    left untouched.
    :param argv: command line arguments, forwarded to ``parse_args``
    :return: None
    """
    args = parse_args(argv)
    path = args["path"]
    prompt = (
        "WARNING: this will remove all non-readable lines in ALL files in folder & nested folders in {}\n Continue? (y/n)"
        .format(path))
    ans = input(prompt)
    while ans not in ("y", "n"):
        ans = input(prompt)
    if ans == "n":
        print("CleanSubtitles canceled")
        return

    # Compiled once instead of being rebuilt for every line.
    # NOTE(review): ``[a-zA-Z]+`` appears twice in the alternation; the
    # second one may have been meant for full-width Latin letters. The
    # pattern is kept unchanged here.
    readable = re.compile(
        "[一-龠]+|[ぁ-ゔ]+|[ァ-ヴー]+|[a-zA-Z]+|[a-zA-Z]+|[々〆〤]+")

    for subfolder in get_subfolders(path):
        for filepath in get_files(subfolder):
            print("doing {}".format(filepath))
            with open(filepath, 'r', encoding='utf-8') as file_r:
                try:
                    lines = file_r.readlines()
                except Exception as e:
                    # Undecodable file: log it and leave it as it is.
                    exception(e)
                    continue
            # The read handle is closed before the file is truncated and
            # rewritten.
            if not lines:
                continue
            with open(filepath, 'w', encoding='utf-8') as file_w:
                file_w.writelines(
                    line for line in lines if readable.search(line))