def abbreviating():
    remove_items = []
    new_items = {}
    for k, v in trans_finder.master_dic.items():
        ref_list = RefList(msg=v)
        new_v = ref_list.quotedToAbbrev(k)
        has_new_v = (new_v is not None) and (len(new_v) > 0)
        if has_new_v:
            new_entry = {k: new_v}
            new_items.update(new_entry)

    has_remove_items = (len(remove_items) > 0)
    if has_remove_items:
        for k in remove_items:
            dd(f'Delete from dictionary:[{k}]')
            del trans_finder.master_dic[k]

    is_writing_changes = (len(new_items) > 0)
    if is_writing_changes:
        trans_finder.master_dic.update(new_items)
        dic_file = '/Users/hoangduytran/blender_manual/test_dic.json'
        print(f'Writing changes to: {dic_file}, number of records:{len(new_items)}')
        trans_finder.writeJSONDic(dict_list=trans_finder.master_dic, file_name=dic_file)

def isIgnoredWord(text_line: str):
    if (text_line is None) or (len(text_line) == 0):
        return True

    # Build the compiled pattern cache on first use.
    is_create_runtime_ignore_list = (df.runtime_ignore_list is None)
    if is_create_runtime_ignore_list:
        df.runtime_ignore_list = []
        for pattern in df.ignore_list:
            if len(pattern) == 0:
                continue
            m = re.compile(pattern, flags=re.I)
            df.runtime_ignore_list.append(m)

    pattern = None
    try:
        for m in df.runtime_ignore_list:
            is_found = (m.search(text_line) is not None)
            if is_found:
                dd(f'isIgnoredWord: pattern:[{m.pattern}] [{text_line}]')
                return True
    except Exception as e:
        df.LOG(f'{e}; text_line:[{text_line}]; pattern:[{pattern}]', error=True)
    # No pattern matched (or an error occurred): the word is not ignored.
    return False

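
# --- Illustrative sketch only (not part of this module) ----------------------
# The same compile-once / search-many idea as isIgnoredWord(), shown standalone.
# The sample patterns and text below are hypothetical; the real list lives in
# df.ignore_list and is cached in df.runtime_ignore_list.
import re

_IGNORE_PATTERNS = [r'^\d+(\.\d+)*$', r'^https?://']   # hypothetical examples
_COMPILED = [re.compile(p, flags=re.I) for p in _IGNORE_PATTERNS]

def _is_ignored(text: str) -> bool:
    """Return True when any ignore pattern matches, False only after all fail."""
    return any(p.search(text) for p in _COMPILED)

# _is_ignored('3.2.1') -> True, _is_ignored('Mirror Modifier') -> False
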
def isReverseOrder(msg):
    for w in Ignore.reverse_order_list:
        is_reverse = (re.search(w, msg, flags=re.I) is not None)
        if is_reverse:
            dd(f'isReverseOrder -> pattern:[{w}] msg:[{msg}]')
            return True
    return False

def isFilePath(text_line: str):
    if (text_line is None) or (len(text_line) == 0):
        return False

    has_path_characters = (df.PATH_CHAR.search(text_line) is not None) and ('kbd' not in text_line)
    # Check whether any word is title case, i.e. Selected/Unselected,
    # in which case the text is not a PATH.
    if has_path_characters:
        word_list = text_line.split(dirsep)
        word: str = None
        for word in word_list:
            is_title_case = word.istitle()
            if is_title_case:
                return False

    starts_with_path_chars = text_line.startswith('~')
    ends_with_extensions = (df.ENDS_WITH_EXTENSION.search(text_line) is not None)
    contain_spaces = (" " in text_line)
    is_path = (has_path_characters or starts_with_path_chars or ends_with_extensions) and not contain_spaces
    if is_path:
        dd("isFilePath", text_line)
        # exit(0)
    return is_path

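
# --- Illustrative sketch only (not part of this module) ----------------------
# A self-contained version of the isFilePath() heuristic. The two regular
# expressions below are hypothetical stand-ins for df.PATH_CHAR and
# df.ENDS_WITH_EXTENSION, and os.sep stands in for dirsep.
import os
import re

_PATH_CHAR = re.compile(r'[/\\]')                          # assumed: "looks like a separator"
_ENDS_WITH_EXTENSION = re.compile(r'\.[A-Za-z0-9]{1,4}$')  # assumed: "ends in .ext"

def _looks_like_path(text: str) -> bool:
    if not text or ' ' in text:
        return False
    has_separator = _PATH_CHAR.search(text) is not None and 'kbd' not in text
    if has_separator and any(part.istitle() for part in text.split(os.sep)):
        # Title-cased words such as "Selected/Unselected" are prose, not paths.
        return False
    return has_separator or text.startswith('~') or _ENDS_WITH_EXTENSION.search(text) is not None

# _looks_like_path('~/textures/wood.png') -> True
# _looks_like_path('Selected/Unselected')  -> False (on POSIX, where os.sep is '/')
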
def findByReduction(self, msg):
    def append_selective(cover_length, new_text_length, new_text, trans, selective_list, function_name):
        entry = (cover_length, new_text_length, new_text, trans, function_name)
        selective_list.append(entry)

    trans = None
    original_text = str(msg)
    selective_list = []
    try:
        start_non_alpha, mid, end_non_alpha = cm.getTextWithin(msg)
        for f, params in self.tran_find_func_list:
            f_name = f.__name__
            dd(f'findByReduction(): trying function:[{f_name}]')
            txt, param1, param2 = params
            is_empty = not (param1 or param2)
            if is_empty:
                new_text, trans, cover_length = f(msg)
            else:
                new_text, trans, cover_length = f(msg, param1, param2)
            new_text_length = len(new_text)
            # The less that is cut off, the better.
            append_selective(cover_length, new_text_length, new_text, trans, selective_list, f_name)

        sorted_selective_list = list(sorted(selective_list, key=OP.itemgetter(0, 1), reverse=True))
        chosen_entry = sorted_selective_list[0]
        cover_length, new_text_length, new_text, trans, function_name = chosen_entry
        has_translation = (trans is not None)
        if not has_translation:
            return new_text, None, cover_length
        else:
            trans = cm.patchingBeforeReturn(start_non_alpha, end_non_alpha, trans, txt)
            dd(f'findByReduction: looking for: [{msg}] trans:[{trans}] function_name:[{function_name}]')
            return new_text, trans, cover_length
    except Exception as e:
        df.LOG(f'{e}; msg:{msg}', error=True)
        raise e

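
# --- Illustrative sketch only (not part of this module) ----------------------
# The selection step of findByReduction() in isolation: run several candidate
# reduction functions, collect (cover_length, remaining_length, ...) tuples and
# keep the entry with the highest coverage. The candidate functions here are
# hypothetical stand-ins for the entries of self.tran_find_func_list.
import operator as OP

def _strip_punct(text):
    stripped = text.strip('.,:;!')
    return stripped, None, len(text) - len(stripped)

def _strip_quotes(text):
    stripped = text.strip('"\'`*')
    return stripped, None, len(text) - len(stripped)

def _pick_best(text, candidates=(_strip_punct, _strip_quotes)):
    scored = []
    for func in candidates:
        new_text, trans, cover_length = func(text)
        scored.append((cover_length, len(new_text), new_text, trans, func.__name__))
    scored.sort(key=OP.itemgetter(0, 1), reverse=True)
    return scored[0]

# _pick_best('"Mirror Modifier"...') -> the _strip_punct entry, which covered the most characters.
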
def get_hd_url(crawler, count=200):
    view_ids = []
    result = []
    try:
        videos = Video.select().where(Video.downloaded == 0).limit(count)
        if videos.count() > 0:
            view_ids = [video.view_id for video in videos]
            result = crawler.get_hd_detail(view_ids, raw=True)
            dd(result)
    except BaseException:
        result = crawler.get_pending_data()
    finally:
        urls = [
            detail['download_url'] for detail in result if detail is not None
        ]
        deleted = set(view_ids) - set(
            [detail['view_id'] for detail in result if detail is not None])
        deleted_count = len(deleted)
        # if deleted_count > 0:
        #     deleted_count = Video.update(downloaded=-1).where(Video.view_id << deleted).execute()
        print('crawl succeeded count {}, deleted count {}'.format(
            len(set(view_ids)) - deleted_count, deleted_count))
        with open('data/urls.txt', 'w') as f:
            for url in urls:
                f.write(url + "\n")
        return urls

def isIgnored(msg):
    if not msg:
        return True

    # orig_msg = str(msg)
    # ex_ga_msg = cm.EXCLUDE_GA.findall(msg)
    # if (len(ex_ga_msg) > 0):
    #     msg = ex_ga_msg[0]
    #     dd("GA trimmed from:", orig_msg, msg)
    try:
        is_mm = isinstance(msg, MatcherRecord)
        if is_mm:
            dd('debug')

        find_msg = msg.lower()
        is_keep = Ignore.isKeep(find_msg)
        if is_keep:
            return False

        is_allowed_contains = Ignore.isKeepContains(find_msg)
        if is_allowed_contains:
            return False

        is_ref_link = is_function = is_ignore_word = is_dos_command = is_ignore_start = False
        is_ref_link = cm.isLinkPath(find_msg)
        if not is_ref_link:
            is_function = (df.FUNCTION.search(find_msg) is not None)
            if not is_function:
                is_ignore_word = Ignore.isIgnoredWord(find_msg)
                if not is_ignore_word:
                    is_dos_command = Ignore.isDosCommand(find_msg)
                    if not is_dos_command:
                        is_ignore_start = Ignore.isIgnoredIfStartsWith(find_msg)

        is_ignore = (is_function or is_ignore_word or is_dos_command or is_ignore_start or is_ref_link)
        # is_ignore = (is_ignore_word or is_dos_command or is_ignore_start)
        if is_ignore:
            # dd("checking for ignore")
            dict_ignore = {
                "is_ignore_word": is_ignore_word,
                "is_dos_command": is_dos_command,
                "is_ignore_start": is_ignore_start,
                "is_function": is_function,
                "is_ref_link": is_ref_link,
            }
            dd("IGNORING:", msg)
            pp(dict_ignore)
        return is_ignore
    except Exception as e:
        df.LOG(f'{e}; msg:{msg}', error=True)
        raise e

def tranByPartitioning(self, sl_txt):
    def markTranslated(txt_loc, sl_txt, tl_txt):
        ft_obs.markLocAsUsed(txt_loc)
        entry = (txt_loc, tl_txt)
        ft_translated_list.append(entry)

    fname = INP.currentframe().f_code.co_name
    ft_map = cm.genmap(sl_txt)
    ft_obs = LocationObserver(sl_txt)
    ft_translated_list = []
    part_txt = None
    try:
        ft_translation = str(sl_txt)
        for ft_loc, ft_word in ft_map:
            if ft_obs.isCompletelyUsed():
                break
            if ft_obs.isLocUsed(ft_loc):
                continue

            part_txt = ft_word
            ft_tran, selected_item, matched_ratio, untran_word_dic = self.simpleFuzzyTranslate(
                ft_word, acceptable_rate=df.FUZZY_ACCEPTABLE_RATIO)
            if ft_tran:
                markTranslated(ft_loc, ft_word, ft_tran)
            else:
                wc = len(ft_word.split())
                is_single_word = (wc == 1)
                if not is_single_word:
                    continue
                chopped_txt, ft_tran = self.findByChangeSuffix(ft_word)
                if ft_tran:
                    markTranslated(ft_loc, ft_word, ft_tran)

        ft_translated_list.sort(reverse=True)
        for ft_loc, ft_tran in ft_translated_list:
            ft_translation = cm.jointText(ft_translation, ft_tran, ft_loc)

        is_translated = (ft_translation != sl_txt)
        return_tran = (ft_translation if is_translated else None)
        dd(f'{fname}() msg:[{sl_txt}] tran_sub_text:[{ft_translation}]')
    except Exception as e:
        return_tran = None
        df.LOG(f'{e}; [{sl_txt}] dealing with: [{part_txt}]', error=True)

    un_tran_list = ft_obs.getUnmarkedPartsAsDict()
    return return_tran, un_tran_list

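
# --- Illustrative sketch only (not part of this module) ----------------------
# The re-assembly step used by tranByPartitioning() in isolation: translated
# fragments are collected as ((start, end), text) pairs and spliced back
# right-to-left so earlier offsets stay valid. cm.jointText() is assumed to
# behave roughly like _splice() below.
def _splice(text, fragment, loc):
    start, end = loc
    return text[:start] + fragment + text[end:]

def _reassemble(source, translated_parts):
    """translated_parts: list of ((start, end), translated_text) over `source`."""
    result = source
    for loc, tran in sorted(translated_parts, reverse=True):
        result = _splice(result, tran, loc)
    return result

# _reassemble('select the active object', [((0, 6), 'chọn'), ((11, 17), 'hoạt động')])
# -> 'chọn the hoạt động object'
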
def makeNonSRRecord(self, txt, root_location):
    sr = self.reproduce()
    print(f'IS TEXT:[{txt}]')
    current_processed_list = self.processed_list.keys()
    is_ignore = (txt in current_processed_list)
    if is_ignore:
        dd(f'makeNonSRRecord: [{txt}] is already processed')
        return None

    sr.__init__(root_loc=root_location,
                tran_sl_txt=txt,
                translation_engine=self.tf,
                processed_dict=self.processed_list,
                glob_sr=self.global_sr_list)
    sr.setupRecords()
    sr.getTextListTobeTranslated()
    self.global_sr_list.update({sr.tran_sl_txt: sr})
    return sr

def makeSRRecord(self, txt, root_location):
    try:
        dict_sl_pat, (dict_sl_txt,
                      dict_sl_word_list,
                      dict_sl_mm,
                      dict_tl_txt,
                      dict_tl_word_list,
                      dict_tl_mm,
                      sent_sl_mm) = self.getDict().getSentStructPattern(txt)

        current_processed_list = self.processed_list.keys()
        is_already_processed = (dict_sl_txt in current_processed_list)
        is_ignore = (not dict_sl_pat) or is_already_processed
        if is_already_processed:
            dd(f'makeSRRecord: [{txt}] is already processed')
        if is_ignore:
            return None

        print(f'IS STRUCTURE:[txt:{txt}] => sl:[{dict_sl_txt}] tl:[{dict_tl_txt}] pat:[{dict_sl_pat}]')
        sr = self.reproduce()
        sr.__init__(root_loc=root_location,
                    dict_sl_txt=dict_sl_txt,
                    dict_sl_word_list=dict_sl_word_list,
                    dict_sl_rec=dict_sl_mm,
                    dict_tl_rec=dict_tl_mm,
                    dict_tl_word_list=dict_tl_word_list,
                    dict_tl_txt=dict_tl_txt,
                    tran_sl_txt=txt,
                    tran_sl_rec=sent_sl_mm,
                    recog_pattern=dict_sl_pat,
                    translation_engine=self.tf,
                    processed_dict=self.processed_list,
                    glob_sr=self.global_sr_list)
        sr.setupRecords()
        need_tran = sr.getTextListTobeTranslated()
        if need_tran:
            df.LOG(f'needed tran:{need_tran}')
            self.global_sr_list.update({sr.tran_sl_txt: sr})
        else:
            df.LOG('NO need translations, PROVIDED or LEAVE AS IS!')
        return sr
    except Exception as e:
        df.LOG(e, error=True)

def translateMatcherRecord(self, mm: MatcherRecord):
    sub_loc: tuple = None
    try:
        ref_txt = mm.getSubText()
        sub_loc = mm.getSubLoc()
        ref_type = mm.type

        is_blank_quote = (ref_type == RefType.BLANK_QUOTE)
        is_kbd = (ref_type == RefType.KBD)
        is_abbr = (ref_type == RefType.ABBR)
        is_menu = (ref_type == RefType.MENUSELECTION)
        is_ga = (ref_type == RefType.GA)
        is_ref = (ref_type == RefType.REF)
        is_doc = (ref_type == RefType.DOC)
        is_osl_attrib = (ref_type == RefType.OSL_ATTRIB)
        is_term = (ref_type == RefType.TERM)
        # ----------
        is_ast = (ref_type == RefType.AST_QUOTE)
        is_dbl_ast_quote = (ref_type == RefType.DBL_AST_QUOTE)
        is_dbl_quote = (ref_type == RefType.DBL_QUOTE)
        is_sng_quote = (ref_type == RefType.SNG_QUOTE)
        is_python_format = (ref_type == RefType.PYTHON_FORMAT)
        is_function = (ref_type == RefType.FUNCTION)
        is_quoted = (is_ast or is_dbl_quote or is_sng_quote or is_dbl_ast_quote or is_blank_quote)

        converted_to_abbr = False
        if is_kbd:
            dd(f'translateRefItem: is_kbd:{ref_txt}')
            ok = self.tf.translateKeyboard(mm)
        elif is_abbr:
            dd(f'translateRefItem: is_abbr:{ref_txt}')
            ok = self.tf.translateAbbrev(mm)
        elif is_menu:
            dd(f'translateRefItem: is_menu:{ref_txt}')
            ok = self.tf.translateMenuSelection(mm)
        elif is_quoted:
            dd(f'translateRefItem: is_quoted:{ref_txt}')
            ok = self.tf.translateQuoted(mm)
            converted_to_abbr = True
        elif is_osl_attrib or is_python_format or is_function:
            return
        else:
            ok = self.tf.translateRefWithLink(mm)
        # mm_tran = cm.jointText(ref_txt, tran, sub_loc)
        # mm.setTranlation(mm_tran, is_fuzzy, is_ignore)
    except Exception as e:
        df.LOG(f'{e} ref_item:{mm}, ref_type:{ref_type}', error=True)

def __init__(self,
             input_k=None,
             item_found=None,
             item_left=None,
             item_right=None,
             input_k_left=None,
             input_k_right=None):
    self.input_k_mid = None
    self.input_k_mid_loc = None
    self.input_k: str = input_k
    self.item_found: str = item_found
    self.item_left = item_left
    self.item_right = item_right
    # self.item_mid = self.findExpVarPart(item_found, item_left, item_right)
    self.input_k_left = input_k_left
    self.input_k_right = input_k_right
    self.input_k_mid, self.input_k_mid_loc = self.findExpVarPart(
        input_k, input_k_left, input_k_right)
    # self.input_k_mid = mid
    # self.input_k_mid_loc = loc
    dd(f'FuzzyExpVarRecord() - self.input_k_mid:[{self.input_k_mid}]; '
       f'self.input_k_mid_loc:[{self.input_k_mid_loc}]')

def getTranBySlittingSymbols(self, input_txt):
    fname = INP.currentframe().f_code.co_name
    pattern_list = [
        df.NON_SPACE_SYMBOLS,
        df.SYMBOLS,
    ]
    translation = str(input_txt)
    translated_list = []
    selective_list = []

    tran, untran_dict = self.tranByPartitioning(input_txt)
    if tran:
        return tran

    for pat in pattern_list:
        word_list = cm.splitWordAtToList(pat, input_txt)
        if not word_list:
            continue
        for loc, txt in word_list:
            tran, untran_dict = self.tranByPartitioning(txt)
            is_translated = (tran is not None) and (tran != txt)
            if is_translated:
                entry = (loc, tran)
                translated_list.append(entry)

    translated_list.sort(reverse=True)
    for loc, tran in translated_list:
        translation = cm.jointText(translation, tran, loc)

    is_translated = (translation != input_txt)
    if is_translated:
        dd(f'{fname}(): input_txt:[{input_txt}]=>[{translation}]')
        return translation
    else:
        return None

def findPattern(self, pattern_list: list, txt: str):
    count_item = 0
    pattern_list.reverse()
    obs = LocationObserver(txt)
    for index, item in enumerate(pattern_list):
        p, ref_type = item
        self.findOnePattern(obs, obs.blank, p, ref_type)

    self.validateFoundEntries()
    if len(self):
        dd('List of refs found:')
        dd('-' * 80)
        pp(self)
        dd('-' * 80)
    return count_item, obs.getUnmarkedPartsAsDict()

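
# --- Illustrative sketch only (not part of this module) ----------------------
# A minimal stand-in for the LocationObserver idea used by findPattern() and
# tranByPartitioning(): remember which character spans of the source text have
# already been consumed so later patterns only see what is left. The class name
# and methods below are hypothetical; the real observer lives elsewhere.
class _SpanTracker:
    def __init__(self, text: str):
        self.text = text
        self.used = [False] * len(text)

    def mark_used(self, loc):
        start, end = loc
        for i in range(start, min(end, len(self.text))):
            self.used[i] = True

    def is_loc_used(self, loc):
        start, end = loc
        return all(self.used[start:end])

    def unmarked_parts(self):
        """Return {(start, end): text} for every maximal unused run."""
        parts, start = {}, None
        for i, used in enumerate(self.used + [True]):
            if not used and start is None:
                start = i
            elif used and start is not None:
                parts[(start, i)] = self.text[start:i]
                start = None
        return parts

# tracker = _SpanTracker('select the active object'); tracker.mark_used((0, 6))
# tracker.unmarked_parts() -> {(6, 24): ' the active object'}
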
def build_finished(app, exception):
    def refListGetKey(item):
        return item[1]

    def dicListGetKey(item):
        k, v = item
        trans, rep = v
        return rep

    # loc_dic_list = {}
    # sorted_list = sorted(trans_finder.dic_list.items(), key=refListGetKey)
    # # pp(sorted_list)
    # for txt, rep in sorted_list:
    #     if ig.isIgnored(txt):
    #         continue
    #
    #     # txt = txt.strip()
    #     must_mark = False
    #     trans = trans_finder.findTranslation(txt)
    #     if not trans:
    #         trans = trans_finder.findTranslationByFragment(txt)
    #         must_mark = True
    #
    #     # is_same = (txt == trans)
    #     k = txt
    #     if (rep > 1) or must_mark:
    #         if trans is None:
    #             trans = ""
    #         v = "{}#{}#".format(trans, rep)
    #     else:
    #         v = trans
    #     entry = {k: v}
    #     loc_dic_list.update(entry)
    # sorted_list = sorted(loc_dic_list.items(), key=dicListGetKey)
    # return

    # file_name = "/Users/hoangduytran/ref_dict_0001.json"
    # dic = cm.removeLowerCaseDic(loc_dic_list)
    # dic = trans_finder.master_dic_backup_list
    # dic.update(trans_finder.master_dic_list)
    # dic = trans_finder.dic_list
    # has_dic = (len(dic) > 0)
    # if not has_dic:
    #     return
    # clean_dic = trans_finder.removeIgnoredEntries(dic)
    # dic = clean_dic
    # pp(dic)
    # exit(0)
    # sorted_list = sorted(dic.items(), key=lambda x: x[1])
    # dic = trans_finder.master_dic_backup_list
    # file_name = trans_finder.master_dic_backup_file
    # print("Writing dictionary to:", file_name)
    # with open(file_name, 'w', newline='\n', encoding='utf8') as out_file:
    #     json.dump(dic, out_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
    # with open(file_name, "r") as f:
    #     data = json.load(f)
    # pp(data)
    # for k, v in data.items():
    #     is_null = (v == None)
    #     if is_null:
    #         entry = {k: ""}
    #         print("updating entry:", entry)
    #         data.update(entry)
    # with open(file_name, 'w', newline='\n', encoding='utf8') as out_file:
    #     json.dump(dic, out_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
    # with open(file_name, 'w', newline='\n', encoding='utf8') as out_file:
    #     json.dump(dic, out_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
    # trans_finder.writeJSONDic(dic_list=sorted_list, file_name="/home/htran/20191228_dict_0001.json")
    # pp(sorted_list)
    # exit(0)

    trans_finder.writeChosenDict(is_master=True)
    trans_finder.writeChosenDict(is_master=False)
    # trans_finder.writeBackupDict()
    # trans_finder.writeMasterDict()
    dd('DEBUG')

def getSentStructPattern(self, key):
    def isMatchedStructMode(pat_matched_text_pair_list):
        is_ok_list = []
        (s_mode_dict, input_txt_list) = pat_matched_text_pair_list
        s_mode_dict_list = list(s_mode_dict.values())
        for index, matched_part in enumerate(input_txt_list):
            smode_item = s_mode_dict_list[index]
            (dict_sl_txt, structure_mode_list) = smode_item
            is_txt_only = (structure_mode_list is None)
            if is_txt_only:
                continue

            smode_rec: SMODEREC = None
            for smode_rec in structure_mode_list:
                structure_mode = smode_rec.smode
                extra_param = smode_rec.extra_param

                is_digits_only = (structure_mode == SMODE.NUMBER_ONLY)
                if is_digits_only:
                    is_number = (df.NUMBERS.search(matched_part) is not None)
                    is_ok_list.append(is_number)
                    continue

                is_no_full_stop = (structure_mode == SMODE.NO_FULL_STOP)
                if is_no_full_stop:
                    no_fullstop = (df.FULLSTOP_IN_BETWEEN.search(matched_part) is None)
                    is_ok_list.append(no_fullstop)
                    continue

                is_no_punctuation = (structure_mode == SMODE.NO_PUNCTUATION)
                if is_no_punctuation:
                    no_punct = (df.PUNCT_IN_BETWEEN.search(matched_part) is None)
                    is_ok_list.append(no_punct)
                    continue

                is_max_upto = (structure_mode == SMODE.MAX_UPTO)
                if is_max_upto:
                    wc = cm.wordCount(matched_part)
                    is_max_upto = (wc <= extra_param)
                    is_ok_list.append(is_max_upto)
                    continue

                is_no_conjunctives = (structure_mode == SMODE.NO_CONJUNCTIVES)
                if is_no_conjunctives:
                    no_conjunctives = (df.BASIC_CONJUNCTS.search(matched_part) is None)
                    is_ok_list.append(no_conjunctives)
                    continue

        ok = (False not in is_ok_list)
        return ok

    def filterKeyChosenSet(pat_item):
        match = re.compile(pat_item, flags=re.I).search(key)
        is_found = (match is not None)
        return is_found

    try:
        default_value = (None, None, None, None, None, None, None)
        klower = key.lower()
        has_cached_key_value_set = (klower in self.local_text_and_chosen_sent_struct_list)
        if has_cached_key_value_set:
            set_value = self.local_text_and_chosen_sent_struct_list[klower]
            (pat, value) = set_value
            return (pat, value)

        selective_match = []
        key_list = self.sentence_struct_dict.keys()
        # pat_list = [x for x in pat_list if 'i\\.e\\.' in x]
        chosen_key_list = list(filter(filterKeyChosenSet, key_list))
        if not chosen_key_list:
            return (None, default_value)

        df.LOG(f'matched key: [{key}]')
        pp(chosen_key_list, width=200)
        # list_of_sent_struct = list(self.sentence_struct_dict.items())
        for pat in chosen_key_list:
            value = self.sentence_struct_dict[pat]
            pattern = re.compile(pat, flags=re.I)
            # match_list = re.findall(pat, key)
            matcher_dict = cm.patternMatchAll(pattern, key)
            sent_sl_record = list(matcher_dict.values())[0]
            loc_text_list = sent_sl_record.getSubEntriesAsList()
            interest_part = loc_text_list[1:]
            if not interest_part:
                interest_part = loc_text_list
            unique_parts = cm.removeDuplicationFromlistLocText(interest_part)
            temp_dict = OrderedDict(unique_parts)
            matched_text_group = temp_dict.values()
            # matched_text_group = matcher.groups()
            (dict_sl_txt,
             dict_sl_word_list,
             dict_sl_mm,
             dict_tl_txt,
             dict_tl_word_list,
             dict_tl_mm) = value
            s_mode_list = dict_sl_mm.smode
            df.LOG(f'matched_text_group:[{matched_text_group}]')
            pattern_and_matched_text_pair_list = (s_mode_list, matched_text_group)
            df.LOG('pattern_and_matched_text_pair_list:')
            df.LOG('-' * 80)
            pp(pattern_and_matched_text_pair_list)
            df.LOG('-' * 80)
            try:
                is_accept = isMatchedStructMode(pattern_and_matched_text_pair_list)
            except Exception as e:
                is_accept = False
            if not is_accept:
                continue

            match_rate = fuzz.ratio(dict_sl_txt, key)
            sent_sl_record.clear()
            sent_sl_record.update(unique_parts)
            value = (*value, sent_sl_record)
            entry = (match_rate, pat, value)
            selective_match.append(entry)

        if not selective_match:
            return (None, default_value)

        selective_match.sort(reverse=True)
        dd('-' * 80)
        pp(selective_match)
        dd('-' * 80)
        first_entry = selective_match[0]
        (match_rate, pat, value) = first_entry
        pattern = re.compile(pat, flags=re.I)
        cached_entry = {klower: (pattern, value)}
        self.local_text_and_chosen_sent_struct_list.update(cached_entry)
        return (pattern, value)
    except Exception as e:
        df.LOG(f'{e};', error=True)
        raise e

def simpleFuzzyTranslate(self, msg: str, acceptable_rate=df.FUZZY_ACCEPTABLE_RATIO):
    def getItemPart(item_txt):
        item_word_list = item_txt.split()
        item_first_word = item_word_list[0]
        item_part = item_first_word[:max_k_length]
        return item_part

    def validate(item):
        item_part = getItemPart(item)
        is_found = (item_part.lower() == k_part.lower())
        if not is_found:
            return -1, None

        allowable_length = (k_length * (1 + df.MAX_FUZZY_TEST_LENGTH))
        if is_k_single_word:
            item_len = len(item)
            acceptable = (item_len <= allowable_length)
        else:
            word_count = len(item.split())
            item_word_count_is_greater_or_equal_k_word_count = (word_count >= k_word_count)
            item_word_count_is_smaller_than_allowable_k_word_count = (word_count <= allowed_k_word_count)
            acceptable = (item_word_count_is_greater_or_equal_k_word_count and
                          item_word_count_is_smaller_than_allowable_k_word_count)
        return_result = (1 if acceptable else 0)
        return return_result, item

    def findListOfCandidates():
        subset = []
        index = cm.binarySearch(key_list, k, key=getItemPart)
        is_found = (index >= 0)
        if not is_found:
            return subset

        found_list = []
        ss = index
        ee = index
        # dd(f'simpleFuzzyTranslate(): start index: [{index}]')
        for i in range(index - 1, 0, -1):
            item = key_list[i]
            cond, found_item = validate(item)
            is_break = (cond == -1)
            if is_break:
                # dd(f'simpleFuzzyTranslate(): traverse backward, stopped at: [{i}], item:[{item}]')
                break
            is_accepted = (cond == 1)
            if is_accepted:
                found_list.append(item)
                ss = i
                # dd(f'simpleFuzzyTranslate(): backward to index: [{i}]')

        for i in range(index, len(key_list)):
            item = key_list[i]
            cond, found_item = validate(item)
            is_break = (cond == -1)
            if is_break:
                # dd(f'simpleFuzzyTranslate(): traverse forward, stopped at: [{i}], item:[{item}]')
                break
            is_accepted = (cond == 1)
            if is_accepted:
                found_list.append(item)
                ee = i

        found_list.sort(key=lambda x: len(x), reverse=True)
        for found_item in found_list:
            ratio = fuzz.ratio(found_item, k)
            is_found = (ratio >= df.FUZZY_LOW_ACCEPTABLE_RATIO)
            if not is_found:
                continue
            entry = (ratio, found_item)
            subset.append(entry)

        subset.sort(reverse=True)
        return subset

    def simpleFindListOfCandidates():
        found_list = []
        for found_item in key_list:
            ratio = fuzz.ratio(found_item, k)
            partial_ratio = fuzz.partial_ratio(found_item, k)
            entry = (ratio, partial_ratio, found_item)
            found_list.append(entry)

        if found_list:
            if is_k_single_word:
                found_list.sort(key=OP.itemgetter(1, 0), reverse=True)
            else:
                found_list.sort(key=OP.itemgetter(0, 1), reverse=True)
            return [found_list[0]]
        else:
            return found_list

    def isKeyListTextValid(dict_item):
        is_same = (k == dict_item.lower())
        is_starts_with_k_part = dict_item.startswith(k_part)
        is_word_len_acceptable = (len(dict_item.split()) <= k_word_count * 1.5)
        key_len = len(dict_item)
        is_total_len_acceptable = (0.5 <= key_len <= k_length * 1.5)
        return is_same or (is_starts_with_k_part and is_word_len_acceptable and is_total_len_acceptable)

    def binSearchFunction(item):
        return item[:max_k_length]

    def simpleKeyListGetting():
        def findInRange(start_index, end_index, step):
            local_found = []
            for i in range(start_index, end_index, step):
                dict_item = dict_keys[i]
                is_found = isKeyListTextValid(dict_item)
                if is_found:
                    local_found.append(dict_item)
                else:
                    dict_part = binSearchFunction(dict_item)
                    is_match = (dict_part == k_part)
                    if not is_match:
                        break
                    else:
                        continue
            return local_found

        found_list = []
        dict_keys = self.local_keys
        index = cm.binarySearch(dict_keys, k, key=binSearchFunction)
        is_found = (index >= 0)
        if not is_found:
            return found_list

        before_items = dict_keys[index - 10: index]
        after_items = dict_keys[index: index + 10]
        test_item = dict_keys[index]
        dict_key_len = len(dict_keys)
        start_stop_at = None
        found_before = findInRange(index, 0, -1)
        found_after = findInRange(index + 1, dict_key_len, 1)
        found_list.extend(found_before)
        found_list.extend(found_after)
        return found_list

    untran_word_dic = {}
    left, k, right = cm.getTextWithin(msg)
    k = k.lower()
    k_length = len(k)
    k_word_list = k.split()
    k_word_count = len(k_word_list)
    is_k_single_word = (k_word_count == 1)
    allowed_k_word_count = int(k_word_count * 1.7)
    k_matching_length = int(ceil(k_length * 0.5))
    if not is_k_single_word:
        first_word = k_word_list[0]
        first_word_len = len(first_word)
        is_too_small = (first_word_len < 3)
        k_matching_length = int(ceil(first_word_len * df.MAX_FUZZY_TEST_LENGTH))
        if is_too_small:
            try:
                second_word = k_word_list[1]
                second_word_len = len(second_word)
                k_matching_length = int(ceil((first_word_len + second_word_len + 1) * df.MAX_FUZZY_TEST_LENGTH))
            except Exception as e:
                pass

    first_word = k_word_list[0]
    first_word_len = len(first_word)
    if is_k_single_word:
        max_k_length = int(first_word_len * 0.5)
    else:
        max_k_length = int(first_word_len * 0.8)
    k_part = first_word[:max_k_length]

    # key_list = [x for x in self.local_keys if x.startswith(k_part)]
    is_cached = (k in self.local_keylist_cache)
    if is_cached:
        key_list = self.local_keylist_cache[k]
    else:
        tic = time.perf_counter()
        # key_list = list(filter(isKeyListTextValid, self.local_keys))
        key_list = simpleKeyListGetting()
        tok = time.perf_counter()
        taken = (tok - tic)
        self.local_keylist_cache.update({k: key_list})
        # pp(key_list)

    subset = key_list
    # subset = findListOfCandidates()
    subset = simpleFindListOfCandidates()
    found_candidates = (len(subset) > 0)
    if not found_candidates:
        return_tran = None
        rat = 0
        selected_item = None
        return return_tran, selected_item, rat, untran_word_dic

    matched_ratio, partial_ratio, selected_item = subset[0]
    is_accepted = ((partial_ratio >= acceptable_rate) if is_k_single_word
                   else (matched_ratio >= acceptable_rate))
    if not is_accepted:
        perfect_match_percent = cm.matchTextPercent(k, selected_item)
        is_accepted = (perfect_match_percent > df.FUZZY_PERFECT_MATCH_PERCENT)
        # dd(f'simpleFuzzyTranslate(): perfect_match_percent:[{perfect_match_percent}] k:[{k}] => selected_item:[{selected_item}]; is_accepted:[{is_accepted}]')
    if not is_accepted:
        return_tran = None
        rat = 0
        selected_item = None
        return return_tran, selected_item, rat, untran_word_dic

    translation_txt = self[selected_item]
    lower_msg = msg.lower()
    try:
        loc, new_selected = cm.locRemain(lower_msg, selected_item)
        translation = lower_msg.replace(new_selected, translation_txt)
        untran_word_dic = cm.getRemainedWord(lower_msg, new_selected)
    except Exception as e:
        # fname = INP.currentframe().f_code.co_name
        # dd(f'{fname}() {e}')
        dd(f'FAILED TO REPLACE: [{lower_msg}] by [{selected_item}] with trans: [{translation_txt}], matched_ratio:[{matched_ratio}]')
        can_accept = (matched_ratio >= acceptable_rate)
        if can_accept:
            translation = translation_txt
            left, mid, right = cm.getTextWithin(lower_msg)
            had_the_same_right = (right and translation.endswith(right))
            had_the_same_left = (left and translation.startswith(left))
            if left and not had_the_same_left:
                translation = left + translation
            if right and not had_the_same_right:
                translation = translation + right
            dd(f'SIMPLE PATCHING: left:[{left}] right:[{right}] trans: [{translation}]')
            untran_word_dic = cm.getRemainedWord(lower_msg, selected_item)
        else:
            fname = INP.currentframe().f_code.co_name
            dd(f'{fname}() Unable to locate translation for {msg}')
            translation = None

    if translation:
        translation = self.replaceTranRef(translation)
    return translation, selected_item, partial_ratio, untran_word_dic

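
# --- Illustrative sketch only (not part of this module) ----------------------
# The candidate-ranking idea behind simpleFindListOfCandidates() in isolation:
# score every dictionary key against the lookup text with fuzz.ratio /
# fuzz.partial_ratio and keep the best one if it clears an acceptance threshold.
# `fuzz` is assumed to come from rapidfuzz (fuzzywuzzy exposes the same calls);
# the sample dictionary and threshold are hypothetical.
import operator as OP
from rapidfuzz import fuzz

_SAMPLE_DIC = {
    'active object': 'vật thể đang hoạt động',
    'active camera': 'máy quay đang hoạt động',
}

def _best_fuzzy_match(text, dic=_SAMPLE_DIC, acceptable_rate=75):
    text = text.lower()
    single_word = (len(text.split()) == 1)
    scored = [(fuzz.ratio(key, text), fuzz.partial_ratio(key, text), key) for key in dic]
    # Single words rank by partial_ratio first; phrases rank by full ratio first.
    scored.sort(key=OP.itemgetter(1, 0) if single_word else OP.itemgetter(0, 1), reverse=True)
    ratio, partial, key = scored[0]
    accepted = (partial if single_word else ratio) >= acceptable_rate
    return (key, dic[key]) if accepted else None

# _best_fuzzy_match('active objects') -> ('active object', 'vật thể đang hoạt động')
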
def parseMessage(self):
    trans = self.tf.isInDict(self.msg)
    if trans:
        self.setTranslation(trans, False, False)
        return

    is_link_path = cm.isLinkPath(self.msg)
    if is_link_path:
        dd(f'parseMessage(): IGNORED [{self.msg}]; is_full_path')
        return

    local_msg = str(self.msg)
    count, unparsed_dict = self.findPattern(df.pattern_list, local_msg)
    self.addUnparsedDict(unparsed_dict)
    if len(self):
        dd('Finishing parseMessage:')
        dd('-' * 80)
        for loc, mm_rec in self.items():
            dd(f'{loc}')
            dd(f'{mm_rec.txt}')
        dd('-' * 80)

def doctree_resolved(app, doctree, docname):
    def abbreviating():
        remove_items = []
        new_items = {}
        for k, v in trans_finder.master_dic.items():
            ref_list = RefList(msg=v)
            new_v = ref_list.quotedToAbbrev(k)
            has_new_v = (new_v is not None) and (len(new_v) > 0)
            if has_new_v:
                new_entry = {k: new_v}
                new_items.update(new_entry)

        has_remove_items = (len(remove_items) > 0)
        if has_remove_items:
            for k in remove_items:
                dd(f'Delete from dictionary:[{k}]')
                del trans_finder.master_dic[k]

        is_writing_changes = (len(new_items) > 0)
        if is_writing_changes:
            trans_finder.master_dic.update(new_items)
            dic_file = '/Users/hoangduytran/blender_manual/test_dic.json'
            print(f'Writing changes to: {dic_file}, number of records:{len(new_items)}')
            trans_finder.writeJSONDic(dict_list=trans_finder.master_dic, file_name=dic_file)

    def checkDictForMultipleMeaningsInTrans():
        pattern = re.compile(r'(\w+(/\w+)+)')
        k: str = None
        v: str = None
        for k, v in trans_finder.master_dic.items():
            k_found_list = pattern.findall(k)
            v_found_list = pattern.findall(v)
            is_same = (len(k_found_list) == len(v_found_list))
            if is_same:
                continue
            k_is_single_word = (len(k.split(' ')) == 1)
            if k_is_single_word:
                continue
            PP(k_found_list)
            print('-' * 3)
            PP(v_found_list)
            print('-' * 3)
            print(f'{k}\n\n{v}')
            print('-' * 40)

    def trimmingText(text):
        txt_has_trimmable_ending = (cm.TRIMMABLE_ENDING.search(text) is not None)
        txt_has_trimmable_beginning = (cm.TRIMMABLE_BEGINNING.search(text) is not None)
        is_trim = (txt_has_trimmable_ending or txt_has_trimmable_beginning)
        if not is_trim:
            return text, False
        text = cm.TRIMMABLE_BEGINNING.sub('', text)
        text = cm.TRIMMABLE_ENDING.sub('', text)
        return text, True

    def removeDictBeginAndEndingPuncts():
        remove_set = {}
        add_set = {}
        for k, v in trans_finder.master_dic.items():
            # is_debug = ('Cut to' in k)
            # if is_debug:
            #     dd('DEBUG')
            trimmed_k, is_trimmed_k = trimmingText(k)
            trimmed_v, is_trimmed_v = trimmingText(v)
            changed = (is_trimmed_k or is_trimmed_v)
            if changed:
                remove_entry = {k: v}
                remove_set.update(remove_entry)
                add_entry = {trimmed_k: trimmed_v}
                add_set.update(add_entry)
                print(f'[{k}]')
                print(f'[{trimmed_k}]')
                print('-' * 3)
                print(f'[{v}]')
                print(f'[{trimmed_v}]')
                print('-' * 40)

        changed = False
        for k, v in remove_set.items():
            remove_entry = {k: v}
            print(f'remove:{remove_entry}')
            del trans_finder.master_dic[k]
            changed = True
        for k, v in add_set.items():
            add_entry = {k: v}
            trans_finder.master_dic.update(add_entry)
            print(f'added: {add_entry}')
            changed = True

        if changed:
            new_dict = cleanupLeadingTrailingPunct(trans_finder.master_dic)
            test_to_file = '/Users/hoangduytran/blender_manual/ref_dict_0005.json'
            trans_finder.writeJSONDic(dict_list=new_dict, file_name=test_to_file)

    def removeDuplication(txt_with_punct):
        # is_debug = (txt_with_punct.endswith('::'))
        # if is_debug:
        #     dd('DEBUG')
        cropped_txt, begin_with_punctuations, ending_with_punctuations = cm.beginAndEndPunctuation(
            txt_with_punct, is_single=True)
        trans = trans_finder.isInList(cropped_txt)
        is_repeat = (trans is not None)
        if not is_repeat:
            cropped_txt, begin_with_punctuations, ending_with_punctuations = cm.beginAndEndPunctuation(
                txt_with_punct, is_single=False)
            trans = trans_finder.isInList(cropped_txt)
            is_repeat = (trans is not None)
        return is_repeat

    def cleanupLeadingTrailingPunct(d_dict):
        return_dict = {}
        for k, v in d_dict.items():
            trimmed_k = str(k)
            trimmed_v = str(v)
            found_k = cm.WORD_WITHOUT_QUOTE.search(k)
            found_v = cm.WORD_WITHOUT_QUOTE.search(v)
            if found_k:
                trimmed_k = found_k.group(1)
            if found_v:
                trimmed_v = found_v.group(1)
            entry = {trimmed_k: trimmed_v}
            return_dict.update(entry)
        return return_dict

    def refToDictItems(ref_list):
        ref_dict = {}
        ref: RefRecord = None
        interest_ref = [
            RefType.REF,
            RefType.DOC,
            RefType.GA,
            RefType.TERM,
        ]
        for ref in ref_list:
            # print(ref)
            type = ref.getOrigin().getRefType()
            first_ref = ref.getRefItemByIndex(0)
            ref_text = first_ref.getText()
            is_debug = ('Poor mans steadycam' in ref_text)
            if is_debug:
                dd('DEBUG')

            en_part = None
            vn_part = None
            d_dict = {}
            if type == RefType.MENUSELECTION:
                print(f'MENUSELECTION:{type}')
                text_list = cm.MENU_TEXT_REVERSE.findall(ref_text)
                length = len(text_list)
                i_index = 0
                for i in range(length):
                    tran = text_list[i_index]
                    if i_index + 1 < length:
                        orig = text_list[i_index + 1]
                    else:
                        print('ERROR: Orig is NOT THERE, use original')
                        orig = ref.getOrigin().getText()
                    entry = {orig: tran}
                    print(f'menu:{entry}')
                    d_dict.update(entry)
                    i_index += 2
                    if i_index >= length:
                        break
            elif type == RefType.ABBR:
                print(f'ABBR:{type}')
                text_list = cm.ABBREV_TEXT_REVERSE.findall(ref_text)
                abbr = text_list[0]
                defin = text_list[1]
                has_further_explanation = (': ' in defin)
                if has_further_explanation:
                    exp_list = defin.split(': ')
                    orig_part = exp_list[0]
                    further_exp = exp_list[1]
                    print(f'abbr:{abbr}; orig_part:{orig_part}; further_exp:{further_exp}')
                    if abbr.isascii():
                        entry = {abbr: f'{orig_part}, {further_exp}'}
                    elif orig_part.isascii():
                        entry = {orig_part: f'{further_exp}, {abbr}'}
                    else:
                        entry = {further_exp: f'{orig_part}, {abbr}'}
                    d_dict.update(entry)
                else:
                    print(f'abbr:{abbr}; defin:{defin}')
                    if defin.isascii():
                        entry = {defin: abbr}
                    else:
                        entry = {abbr: defin}
                    d_dict.update(entry)
            elif type in interest_ref:
                print(f'GENERIC_REF:{type}')
                text_list = cm.REF_TEXT_REVERSE.findall(ref_text)
                has_text = (len(text_list) > 0)
                if not has_text:
                    origin_text = ref.getOrigin().getText()
                    print(f'ERROR: origin_text:{origin_text}')
                    # print(f'{text_list}, appeared to be empty!!!')
                else:
                    vn_part, en_part = text_list[0]
                    print(f'en_part:{en_part} vn_part:{vn_part}')
                    entry = {en_part: vn_part}
                    d_dict.update(entry)
            else:
                dd(f'{type} is not the type we are looking for.')
            ref_dict.update(d_dict)

        return_dict = cleanupLeadingTrailingPunct(d_dict)
        return return_dict

    def listDictRefsToDict():
        interest_ref_list = [
            RefType.MENUSELECTION,
            RefType.REF,
            RefType.DOC,
            RefType.GA,
            RefType.TERM,
            RefType.ABBR,
        ]
        ref_dict = {}
        ref_dict_filename = '/Users/hoangduytran/blender_manual/ref_dict_refsonly.json'
        for k, v in trans_finder.master_dic.items():
            ref_list = RefList(msg=v, keep_orig=False, tf=trans_finder)
            ref_list.parseMessage()
            inter_ref_list = ref_list.getListOfRefType(interest_ref_list)
            has_ref = (len(inter_ref_list) > 0)
            if not has_ref:
                continue
            current_ref_dict = refToDictItems(inter_ref_list)
            ref_dict.update(current_ref_dict)

        has_dict_content = (len(ref_dict) > 0)
        if has_dict_content:
            trans_finder.writeJSONDic(dict_list=ref_dict, file_name=ref_dict_filename)

    def tranRef(msg, is_keep_original):
        ref_list = RefList(msg=msg, keep_orig=is_keep_original, tf=trans_finder)
        ref_list.parseMessage()
        ref_list.translateRefList()
        tran = ref_list.getTranslation()
        # trans_finder.addDictEntry((msg, tran))
        # print("Got translation from REF_LIST")
        return tran

    # def fuzzyTextSimilar(txt1: str, txt2: str, accept_ratio):
    #     try:
    #         similar_ratio = LE.ratio(txt1, txt2)
    #         is_similar = (similar_ratio >= accept_ratio)
    #         return is_similar
    #     except Exception as e:
    #         print(e)
    #         return False

    def getTimeNow():
        local_time = timezone('Europe/London')
        fmt = '%Y-%m-%d %H:%M%z'
        loc_dt = local_time.localize(datetime.datetime.now())
        formatted_dt = loc_dt.strftime(fmt)
        return formatted_dt

    # is_running = runAppOrNot()
    # if not is_running:
    #     return
    # correctingDictionary()
    # checkDictKeyboard()
    # checkDictRef()
    # checkNonTranslatedDictWords()
    # checkDictForMultipleMeaningsInTrans()
    # removeDictBeginAndEndingPuncts()
    # listDictRefsToDict()
    # trans_finder.saveMasterDict()
    # exit(0)
    try:
        is_debug = ('vr_scene_inspection' in docname)
        if is_debug:
            dd('DEBUG')

        ex_env_key = 'EX_PO_TRANS'
        is_ex_env_set = (ex_env_key in os.environ)
        if not is_ex_env_set:
            return

        ex_env_key_value = os.environ[ex_env_key]
        is_ex_set_true = (ex_env_key_value.lower() == 'true')
        if not is_ex_set_true:
            return

        debug_file = cm.debug_file
        if debug_file:
            is_debug_file = (debug_file in docname)
            if not is_debug_file:
                return

        build_dir = "build/rstdoc"
        po_vi_dir = "locale/vi/LC_MESSAGES"
        po_file_path = "{}.po".format(docname)
        local_path = os.path.dirname(os.path.abspath(__file__))
        blender_docs_path = cm.BLENDER_DOCS  # os.path.dirname(local_path)
        locale_vi_path = "locale/vi/LC_MESSAGES"
        po_path = os.path.join(blender_docs_path, os.path.join(locale_vi_path, po_file_path))
        if not os.path.isfile(po_path):
            msg = f'po_path: {po_path} NOT FOUND!'
            print(msg)
            raise Exception(msg)

        # Loading the local po file to get translations, if any.
        po_dic, current_po_cat = trans_finder.loadPOAsDic(po_path)
        trans_finder.flatPOFile(po_path)

        rst_output_location = os.path.join(blender_docs_path, build_dir)
        output_path = os.path.join(rst_output_location, po_file_path)

        local_time = timezone(TIME_ZONE)
        time_now = local_time.localize(datetime.datetime.now())
        local_locale = locale.getlocale()[0]

        current_header = current_po_cat._get_header_comment()
        new_po_cat = Catalog(locale="vi",
                             header_comment=current_header,
                             project=current_po_cat.project,
                             version=current_po_cat.version,
                             copyright_holder=YOUR_ID,
                             creation_date=current_po_cat.creation_date,
                             revision_date=time_now,
                             last_translator=YOUR_ID,
                             language_team=YOUR_TRANSLATION_TEAM)
        dd("#" * 80)
        dd("filename: {}".format(output_path))

        # msgid = "Lines should be less than 120 characters long."
        # msgstr = "Số chữ trong các dòng phải ít hơn 120 ký tự."
        # trans_finder.addDictEntry((msgid, msgstr), False)
        # exit(0)
        for node, msg in extract_messages(doctree):
            msg = msg.strip()
            dd("=" * 80)
            dd("msgid:[{}]".format(msg))
            # Clean up the po file: keep the original text for node types whose
            # content is normally repeated verbatim alongside the translation.
            is_inline = isinstance(node, nodes.inline)
            is_emphasis = isinstance(node, nodes.emphasis)
            is_title = isinstance(node, nodes.title)
            is_term = isinstance(node, nodes.term)
            is_rubric = isinstance(node, nodes.rubric)
            is_field_name = isinstance(node, nodes.field_name)
            is_reference = isinstance(node, nodes.reference)
            is_strong = isinstance(node, nodes.strong)
            is_keep_original = (is_inline or is_emphasis or is_title or is_term or
                                is_rubric or is_field_name or is_reference or is_strong)

            tran = None
            # is_debug = ('Get involved in discussions' in msg)
            # if is_debug:
            #     dd('DEBUG')
            is_ignore = ig.isIgnored(msg)
            if is_ignore:
                print(f'IGNORED: {msg}')
                continue

            # is_added = False
            tran, is_ignore = trans_finder.findTranslation(msg)
            if is_ignore:
                continue

            has_translation = (tran is not None)
            if not has_translation:
                is_debug = ('is based on the OpenXR specification' in msg)
                if is_debug:
                    dd('Debug')
                ref_list = RefList(msg=msg, keep_orig=is_keep_original, tf=trans_finder)
                ref_list.parseMessage()
                ref_list.translateRefList()
                tran = ref_list.getTranslation()
                # tran = tranRef(msg, is_keep_original)
                has_translation = (tran is not None)

            if not has_translation:
                tran = po_dic[msg]
                has_translation = (tran is not None)

            if has_translation:
                has_month = ('Tháng ' in tran)
                has_original = (msg.lower() in tran.lower())
                has_link = (cm.REF_LINK.search(tran) is not None)
                can_ignore = (has_month or has_original or has_link)
                is_repeat = is_keep_original and not can_ignore
                if is_repeat:
                    print('Repeating MSG')
                    tran = cm.matchCase(msg, tran)
                    tran = f'{tran} -- {msg}'
                    print(f'Repeating MSG:{tran}')

            if tran is not None:
                new_po_cat.add(msg, string=tran)
            else:
                new_po_cat.add(msg, string="")

            print(f'msgid "{msg}"')
            if tran is not None:
                print(f'msgstr "{tran}"')
            else:
                print('msgstr ""')

        print("Output to the path:", new_po_cat, output_path)
        c.dump_po(output_path, new_po_cat)
        # dd('DEBUG')
    except Exception as e:
        df.LOG(f'{e}', error=True)
