def is_jp_word(word): """Verifies if given word has japanese chars. Parameters: word (str): word to evaluate Returns: bool: wheter or not japanese chars exists in the string """ return any([is_cjk(char) for char in word])
def extract_cjk(mixed_string):
    """Takes a string with English in it and returns only the Japanese parts.

    This is mostly for the tooltip attributes.

    :param mixed_string: A string with some Japanese and some other language in it
    :return: A string with only the Japanese parts
    """
    jp_only = []
    for char in mixed_string:
        if is_cjk(char):
            jp_only.append(char)
    return "".join(jp_only)
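# Minimal usage sketch for extract_cjk(); assumes the same is_cjk() helper used
# above is in scope, and the sample string is illustrative (not from the source):
#     >>> extract_cjk("設定 Settings 画面")
#     '設定画面'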
def main(source, trim, dest, prefix, key):
    tl_dict = {}
    file_name = os.path.split(source)[1]
    with open("new_file", "w") as new_file:
        with open(source) as old_file:
            # the prefix for the json keys
            file_path = os.path.normpath(old_file.name)
            file_path = file_path.replace(os.path.splitext(file_path)[1], "")
            file_ext = os.path.splitext(file_name)[1]
            # removes a subset of directories from the filepath up to and including the trim argument
            if trim:
                trim_index = file_path.find(trim)
                if trim_index == -1:
                    raise ValueError("Subdir path doesn't exist")
                trim_path = file_path[(trim_index + len(trim)):].replace(trim, "")
                split_path = trim_path.split(os.sep)[1:]
                camelCase = [s.title() for s in split_path[1:]]
                strip_punctuation = [s.replace("_", "") for s in camelCase]
                file_prefix = "".join([split_path[0]] + strip_punctuation)
            # sets the translation tag to a manually input key
            elif key:
                file_prefix = key
            # otherwise fall back to the last three path components
            else:
                split_path = file_path.split(os.sep)[1:][-3:]
                camelCase = [s.title() for s in split_path[1:]]
                strip_punctuation = [s.replace("_", "") for s in camelCase]
                file_prefix = "".join([split_path[0]] + strip_punctuation)
            for line in old_file:
                if any(is_cjk(char) for char in line):
                    japanese = extract_cjk(line)
                    # build a short romaji key from the first three words
                    romaji = "_".join(
                        re.sub(r'[()−]', '', kanji_to_romaji(japanese)).split()[:3])
                    tl_dict[romaji] = japanese
                    i18n_tag = {
                        ".html": f"{{{{ '{file_prefix}.{romaji}' | translate }}}}",
                        ".js": f"$translate.instant('{file_prefix}.{romaji}')",
                        ".rb": f"t('{romaji}')",
                    }
                    if file_ext == ".html":
                        new_file.write(re.sub(japanese, i18n_tag[file_ext], line))
                    else:
                        # .js and .rb strings are quoted in the source line
                        new_file.write(re.sub(f"'{japanese}'", i18n_tag[file_ext], line))
                else:
                    new_file.write(line)
    # replace the original file with the rewritten copy
    shutil.move(os.path.join(os.getcwd(), "new_file"), source)
    # path to save the translation file to
    base_dir = os.path.split(old_file.name)[0]
    if file_ext in ('.html', '.js'):
        i18n_file = (
            f"{os.path.splitext(os.path.split(old_file.name)[1])[0]}{prefix}.json"
        )
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, i18n_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a json file adjacent to the source file
            output_path = os.path.join(base_dir, i18n_file)
        with open(output_path, "w+") as rj:
            if key:
                # nest the translations under each segment of the dotted key
                json_key = key.split('.')
                romaji_dict = {json_key[-1]: tl_dict}
                for k in reversed(json_key[:-1]):
                    romaji_dict = {k: romaji_dict}
                json.dump(romaji_dict, rj, ensure_ascii=False, indent=2)
            else:
                json.dump({file_prefix: tl_dict}, rj, ensure_ascii=False, indent=2)
            rj.write("\n")
    else:
        i18n_file = (
            f"{os.path.splitext(os.path.split(old_file.name)[1])[0]}{prefix}.yml"
        )
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, i18n_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a yaml file adjacent to the source file
            output_path = os.path.join(base_dir, i18n_file)
        # let the text-mode handle do the UTF-8 encoding; passing encoding= to
        # yaml.dump would make it emit bytes, which a text-mode handle rejects
        with open(output_path, "w+", encoding="utf-8") as rj:
            yaml.dump({file_prefix: tl_dict}, rj, allow_unicode=True,
                      default_flow_style=False)
            rj.write("\n")
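# Usage sketch for main(); the argument values are illustrative, not from the
# source. As I read the function, it rewrites CJK strings in `source` with i18n
# tags and writes the extracted translations to a .json file (for .html/.js
# sources) or a .yml file (otherwise), named with `prefix` appended:
#     main(source="app/views/settings.html", trim="app", dest="locales",
#          prefix=".ja", key=None)
#     # -> rewrites settings.html in place and writes locales/settings.ja.json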
def tokenize(self, tokens, return_str=False):
    """
    Python port of the Moses detokenizer.

    :param tokens: A list of strings, i.e. tokenized text.
    :type tokens: list(str)
    :param return_str: If True, return a single detokenized string instead of a list.
    :return: str
    """
    # Convert the list of tokens into a string and pad it with spaces.
    text = u" {} ".format(" ".join(tokens))
    # Converts input string into unicode.
    text = text_type(text)
    # Detokenize the aggressive hyphen split.
    regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
    text = re.sub(regexp, substitution, text)
    # Unescape the XML symbols.
    text = self.unescape_xml(text)
    # Keep track of no. of quotation marks.
    quote_counts = {u"'": 0, u'"': 0}
    # The *prepend_space* variable is used to control the "effects" of
    # detokenization as the function loops through the list of tokens and
    # changes the *prepend_space* accordingly as it sequentially checks
    # through the language specific and language independent conditions.
    prepend_space = " "
    detokenized_text = ""
    tokens = text.split()
    # Iterate through every token and apply language specific detokenization rule(s).
    for i, token in enumerate(iter(tokens)):
        # Check if the first char is CJK.
        if is_cjk(token[0]):
            # Perform left shift if this is a second consecutive CJK word.
            if i > 0 and is_cjk(token[-1]):
                detokenized_text += token
            # But do nothing special if this is a CJK word that doesn't follow a CJK word
            else:
                detokenized_text += prepend_space + token
            prepend_space = " "
        # If it's a currency symbol.
        elif token in self.IsSc:
            # Perform right shift on currency and other random punctuation items
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif re.match(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
            # In French, these punctuations are prefixed with a non-breakable space.
            if self.lang == 'fr' and re.match(r'^[\?\!\:\;\\\%]$', token):
                detokenized_text += " "
            # Perform left shift on punctuation items.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'en' and i > 0
              and re.match(u'^[\'][{}]'.format(self.IsAlpha), token)
              and re.match(u'[{}]'.format(self.IsAlnum), token)):
            # For English, left-shift the contraction.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'cs' and i > 1
              and re.match(r'^[0-9]+$', tokens[-2])  # If the previous previous token is a number.
              and re.match(r'^[.,]$', tokens[-1])  # If previous token is a dot.
              and re.match(r'^[0-9]+$', token)):  # If the current token is a number.
            # In Czech, left-shift floats that are decimal numbers.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang in ['fr', 'it'] and i <= len(tokens) - 2
              and re.match(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.match(u'^[{}]$'.format(self.IsAlpha), tokens[i + 1])):  # If the next token is alpha.
            # For French and Italian, right-shift the contraction.
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif (self.lang == 'cs' and i <= len(tokens) - 3
              and re.match(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.match(u'^[-–]$', tokens[i + 1])
              and re.match(u'^li$|^mail.*', tokens[i + 2], re.IGNORECASE)):
            # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
            # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
            detokenized_text += prepend_space + token + tokens[i + 1]
            next(tokens, None)  # Advance over the dash
            prepend_space = ""
        # Combine punctuation smartly.
        elif re.match(r'''^[\'\"„“`]+$''', token):
            normalized_quo = token
            if re.match(r'^[„“”]+$', token):
                normalized_quo = '"'
            quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
            if self.lang == 'cs' and token == u"„":
                quote_counts[normalized_quo] = 0
            if self.lang == 'cs' and token == u"“":
                quote_counts[normalized_quo] = 1
            if quote_counts[normalized_quo] % 2 == 0:
                if (self.lang == 'en' and token == u"'" and i > 0
                        and re.match(r'[s]$', tokens[i - 1])):
                    # Left shift on single quote for possessives ending
                    # in "s", e.g. "The Jones' house"
                    detokenized_text += token
                    prepend_space = " "
                else:
                    # Right shift.
                    detokenized_text += prepend_space + token
                    prepend_space = ""
                    quote_counts[normalized_quo] += 1
            else:
                # Left shift.
                detokenized_text += token
                prepend_space = " "
                quote_counts[normalized_quo] += 1
        elif (self.lang == 'fi' and re.match(r':$', tokens[i - 1])
              and re.match(self.FINNISH_REGEX, token)):
            # Finnish : without intervening space if followed by case suffix
            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
            detokenized_text += prepend_space + token
            prepend_space = " "
        else:
            detokenized_text += prepend_space + token
            prepend_space = " "
    # Merge multiple spaces.
    regexp, substitution = self.ONE_SPACE
    detokenized_text = re.sub(regexp, substitution, detokenized_text)
    # Remove leading and trailing spaces.
    detokenized_text = detokenized_text.strip()
    return detokenized_text if return_str else detokenized_text.split()
def tokenize(self, tokens, return_str=False, unescape=True):
    """
    Python port of the Moses detokenizer.

    :param tokens: A list of strings, i.e. tokenized text.
    :type tokens: list(str)
    :param return_str: If True, return a single detokenized string instead of a list.
    :return: str
    """
    # Convert the list of tokens into a string and pad it with spaces.
    text = u" {} ".format(" ".join(tokens))
    # Converts input string into unicode.
    text = text_type(text)
    # Detokenize the aggressive hyphen split.
    regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
    text = re.sub(regexp, substitution, text)
    if unescape:
        # Unescape the XML symbols.
        text = self.unescape_xml(text)
    # Keep track of no. of quotation marks.
    quote_counts = {u"'": 0, u'"': 0, u"``": 0, u"`": 0, u"''": 0}
    # The *prepend_space* variable is used to control the "effects" of
    # detokenization as the function loops through the list of tokens and
    # changes the *prepend_space* accordingly as it sequentially checks
    # through the language specific and language independent conditions.
    prepend_space = " "
    detokenized_text = ""
    tokens = text.split()
    # Iterate through every token and apply language specific detokenization rule(s).
    for i, token in enumerate(iter(tokens)):
        # Check if the first char is CJK.
        if is_cjk(token[0]):
            # Perform left shift if this is a second consecutive CJK word.
            if i > 0 and is_cjk(token[-1]):
                detokenized_text += token
            # But do nothing special if this is a CJK word that doesn't follow a CJK word
            else:
                detokenized_text += prepend_space + token
            prepend_space = " "
        # If it's a currency symbol.
        elif token in self.IsSc:
            # Perform right shift on currency and other random punctuation items
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
            # In French, these punctuations are prefixed with a non-breakable space.
            if self.lang == 'fr' and re.search(r'^[\?\!\:\;\\\%]$', token):
                detokenized_text += " "
            # Perform left shift on punctuation items.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'en' and i > 0
              and re.search(u"^[\'][{}]".format(self.IsAlpha), token)):
            # and re.search(u'[{}]$'.format(self.IsAlnum), tokens[i-1])):
            # For English, left-shift the contraction.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'cs' and i > 1
              and re.search(r'^[0-9]+$', tokens[-2])  # If the previous previous token is a number.
              and re.search(r'^[.,]$', tokens[-1])  # If previous token is a dot.
              and re.search(r'^[0-9]+$', token)):  # If the current token is a number.
            # In Czech, left-shift floats that are decimal numbers.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang in ['fr', 'it', 'ga'] and i <= len(tokens) - 2
              and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.search(u'^[{}]$'.format(self.IsAlpha), tokens[i + 1])):  # If the next token is alpha.
            # For French and Italian, right-shift the contraction.
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif (self.lang == 'cs' and i <= len(tokens) - 3
              and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.search(u'^[-–]$', tokens[i + 1])
              and re.search(u'^li$|^mail.*', tokens[i + 2], re.IGNORECASE)):
            # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
            # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
            detokenized_text += prepend_space + token + tokens[i + 1]
            next(tokens, None)  # Advance over the dash
            prepend_space = ""
        # Combine punctuation smartly.
        elif re.search(r'''^[\'\"„“`]+$''', token):
            normalized_quo = token
            if re.search(r'^[„“”]+$', token):
                normalized_quo = '"'
            quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
            if self.lang == 'cs' and token == u"„":
                quote_counts[normalized_quo] = 0
            if self.lang == 'cs' and token == u"“":
                quote_counts[normalized_quo] = 1
            if quote_counts[normalized_quo] % 2 == 0:
                if (self.lang == 'en' and token == u"'" and i > 0
                        and re.search(r'[s]$', tokens[i - 1])):
                    # Left shift on single quote for possessives ending
                    # in "s", e.g. "The Jones' house"
                    detokenized_text += token
                    prepend_space = " "
                else:
                    # Right shift.
                    detokenized_text += prepend_space + token
                    prepend_space = ""
                    quote_counts[normalized_quo] += 1
            else:
                # Left shift.
                detokenized_text += token
                prepend_space = " "
                quote_counts[normalized_quo] += 1
        elif (self.lang == 'fi' and re.search(r':$', tokens[i - 1])
              and re.search(self.FINNISH_REGEX, token)):
            # Finnish : without intervening space if followed by case suffix
            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
            detokenized_text += prepend_space + token
            prepend_space = " "
        else:
            detokenized_text += prepend_space + token
            prepend_space = " "
    # Merge multiple spaces.
    regexp, substitution = self.ONE_SPACE
    detokenized_text = re.sub(regexp, substitution, detokenized_text)
    # Remove leading and trailing spaces.
    detokenized_text = detokenized_text.strip()
    return detokenized_text if return_str else detokenized_text.split()
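# Usage sketch (assumption: this tokenize() method lives on a Moses detokenizer
# class constructed with a language code, e.g. sacremoses.MosesDetokenizer; the
# tokens below are illustrative):
#     md = MosesDetokenizer(lang='en')
#     md.tokenize(['Hello', ',', 'world', '!'], return_str=True)
#     # -> 'Hello, world!'  (punctuation is left-shifted onto the preceding word)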
def is_jp_word(word):
    """Return True if the word contains any Japanese (CJK) characters."""
    return any(is_cjk(char) for char in word)
def scrape_html(source, trim, dest, lang):
    # this must be r+ mode because a+ doesn't work with beautifulsoup
    with open(source, "r+") as f:
        soup = BeautifulSoup(f, "html5lib")
        # removes all script and style tags
        for script in soup(["script", "style"]):
            script.decompose()
        # just the text
        text = soup.get_text()
        lines = [
            line for line in text.split()
            if line != "" and any(is_cjk(char) for char in line)
        ]
        # <i> tags
        eyes = [
            i["title"] for i in soup.find_all("i")
            if i.has_attr("title") and any(is_cjk(char) for char in i["title"])
        ]
        # tooltip attributes in spans
        tooltips = [
            extract_cjk(s["uib-tooltip"]) for s in soup.find_all("span")
            if s.has_attr("uib-tooltip")
            and any(is_cjk(char) for char in s["uib-tooltip"])
        ]
        lines += eyes
        lines += tooltips
        # sort the lines by length, to avoid the edge case where a short string is a
        # substring of a longer string and the short substring gets replaced first and
        # breaks the longer string into 日本romaji語 that doesn't get picked up by the regex
        lines = sorted(lines, key=len, reverse=True)
        # only take the first three words of long romaji strings
        romaji = {
            "_".join(kanji_to_romaji(line).split()[:3]): line
            for line in lines
        }
        # convert the entire html doc into a string to parse with regex; this is because
        # contents breaks the document into pieces by parent elements and re.search
        # doesn't work well
        soup_str = soup.prettify()
        # the prefix for the json keys
        file_path = os.path.normpath(f.name)
        file_path = file_path.replace(os.path.splitext(file_path)[1], "")
        # removes a subset of directories from the filepath up to and including the trim argument
        if trim:
            trim_index = file_path.find(trim)
            if trim_index == -1:
                raise ValueError("Subdir path doesn't exist")
            trim_path = file_path[(trim_index + len(trim)):].replace(trim, "")
            split_path = trim_path.split(os.sep)[1:]
            camelCase = [s.title() for s in split_path[1:]]
            strip_punctuation = [s.replace("_", "") for s in camelCase]
            file_prefix = "".join([split_path[0]] + strip_punctuation)
        # removes everything in the path up to zcs
        else:
            split_path = file_path.split(os.sep)[1:]
            split_dirs = split_path[split_path.index("zcs"):]
            camelCase = [s.title() for s in split_dirs[1:]]
            strip_punctuation = [s.replace("_", "") for s in camelCase]
            file_prefix = "".join([split_dirs[0]] + strip_punctuation)
        # overwrite japanese strings with {{ romaji tags }}
        for key, value in romaji.items():
            jp_re = re.compile(value)
            soup_str = jp_re.sub(
                f"{{{{ '{file_prefix}.{key}' | translate }}}}", soup_str)
        # path to save the json file to
        base_dir = os.path.split(f.name)[0]
        json_file = f"{os.path.splitext(os.path.split(f.name)[1])[0]}.{lang}.json"
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, json_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a json file adjacent to the html file
            output_path = os.path.join(base_dir, json_file)
        with open(output_path, "w+") as rj:
            json.dump({file_prefix: romaji}, rj, ensure_ascii=False, indent=2)
        # zero out the file and replace it with the modified string in place
        f.seek(0)
        f.truncate()
        f.write(soup_str)
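# Usage sketch for scrape_html(); the path and language code are illustrative,
# not from the source. Assumes BeautifulSoup/html5lib and kanji_to_romaji are
# installed; note that when trim is falsy the path must contain a "zcs" segment:
#     scrape_html("app/zcs/views/settings.html", trim=None, dest=None, lang="ja")
#     # -> rewrites settings.html in place and writes settings.ja.json next to it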
def is_valid_text(s):
    """
    Check whether HTML text content is valid.
    Text content is considered valid if it contains at least one Japanese character.
    """
    return any(is_cjk(ch) for ch in s)