def is_jp_word(word): """Verifies if given word has japanese chars. Parameters: word (str): word to evaluate Returns: bool: wheter or not japanese chars exists in the string """ return any([is_cjk(char) for char in word])
def extract_cjk(mixed_string):
    """Takes a string with English in it and returns only the Japanese parts.

    This is mostly for the tooltip attributes.

    :param mixed_string: A string with some Japanese and some other language in it
    :return: A string with only the Japanese parts
    """
    jp_only = []
    for char in mixed_string:
        if is_cjk(char):
            jp_only.append(char)
    return "".join(jp_only)
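# Minimal usage sketch for extract_cjk(); assumes the same is_cjk() helper used
# above is in scope, and the sample string is illustrative (not from the source):
#     >>> extract_cjk("設定 Settings 画面")
#     '設定画面'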
def main(source, trim, dest, prefix, key):
    tl_dict = {}
    file_name = os.path.split(source)[1]
    with open("new_file", "w") as new_file:
        with open(source) as old_file:
            # the prefix for the json keys
            file_path = os.path.normpath(old_file.name)
            file_path = file_path.replace(os.path.splitext(file_path)[1], "")
            file_ext = os.path.splitext(file_name)[1]
            # removes a subset of directories from the filepath up to and including the trim argument
            if trim:
                trim_index = file_path.find(trim)
                if trim_index == -1:
                    raise ValueError("Subdir path doesn't exist")
                trim_path = file_path[(trim_index + len(trim)):].replace(trim, "")
                split_path = trim_path.split(os.sep)[1:]
                camelCase = [s.title() for s in split_path[1:]]
                strip_punctuation = [s.replace("_", "") for s in camelCase]
                file_prefix = "".join([split_path[0]] + strip_punctuation)
            # sets the translation tag to a manually input key
            elif key:
                file_prefix = key
            # otherwise fall back to the last three path components
            else:
                split_path = file_path.split(os.sep)[1:][-3:]
                camelCase = [s.title() for s in split_path[1:]]
                strip_punctuation = [s.replace("_", "") for s in camelCase]
                file_prefix = "".join([split_path[0]] + strip_punctuation)
            for line in old_file:
                if any(is_cjk(char) for char in line):
                    japanese = extract_cjk(line)
                    # build a short romaji key from the first three words
                    romaji = "_".join(
                        re.sub(r'[()−]', '', kanji_to_romaji(japanese)).split()[:3])
                    tl_dict[romaji] = japanese
                    i18n_tag = {
                        ".html": f"{{{{ '{file_prefix}.{romaji}' | translate }}}}",
                        ".js": f"$translate.instant('{file_prefix}.{romaji}')",
                        ".rb": f"t('{romaji}')",
                    }
                    if file_ext == ".html":
                        new_file.write(re.sub(japanese, i18n_tag[file_ext], line))
                    else:
                        # .js and .rb strings are quoted in the source line
                        new_file.write(re.sub(f"'{japanese}'", i18n_tag[file_ext], line))
                else:
                    new_file.write(line)
    # replace the original file with the rewritten copy
    shutil.move(os.path.join(os.getcwd(), "new_file"), source)
    # path to save the translation file to
    base_dir = os.path.split(old_file.name)[0]
    if file_ext in ('.html', '.js'):
        i18n_file = (
            f"{os.path.splitext(os.path.split(old_file.name)[1])[0]}{prefix}.json"
        )
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, i18n_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a json file adjacent to the source file
            output_path = os.path.join(base_dir, i18n_file)
        with open(output_path, "w+") as rj:
            if key:
                # nest the translations under each segment of the dotted key
                json_key = key.split('.')
                romaji_dict = {json_key[-1]: tl_dict}
                for k in reversed(json_key[:-1]):
                    romaji_dict = {k: romaji_dict}
                json.dump(romaji_dict, rj, ensure_ascii=False, indent=2)
            else:
                json.dump({file_prefix: tl_dict}, rj, ensure_ascii=False, indent=2)
            rj.write("\n")
    else:
        i18n_file = (
            f"{os.path.splitext(os.path.split(old_file.name)[1])[0]}{prefix}.yml"
        )
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, i18n_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a yaml file adjacent to the source file
            output_path = os.path.join(base_dir, i18n_file)
        # let the text-mode handle do the UTF-8 encoding; passing encoding= to
        # yaml.dump would make it emit bytes, which a text-mode handle rejects
        with open(output_path, "w+", encoding="utf-8") as rj:
            yaml.dump({file_prefix: tl_dict}, rj, allow_unicode=True,
                      default_flow_style=False)
            rj.write("\n")
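# Usage sketch for main(); the argument values are illustrative, not from the
# source. As I read the function, it rewrites CJK strings in `source` with i18n
# tags and writes the extracted translations to a .json file (for .html/.js
# sources) or a .yml file (otherwise), named with `prefix` appended:
#     main(source="app/views/settings.html", trim="app", dest="locales",
#          prefix=".ja", key=None)
#     # -> rewrites settings.html in place and writes locales/settings.ja.json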
def tokenize(self, tokens, return_str=False):
    """
    Python port of the Moses detokenizer.

    :param tokens: A list of strings, i.e. tokenized text.
    :type tokens: list(str)
    :param return_str: If True, return a single detokenized string instead of a list.
    :return: str
    """
    # Convert the list of tokens into a string and pad it with spaces.
    text = u" {} ".format(" ".join(tokens))
    # Converts input string into unicode.
    text = text_type(text)
    # Detokenize the aggressive hyphen split.
    regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
    text = re.sub(regexp, substitution, text)
    # Unescape the XML symbols.
    text = self.unescape_xml(text)
    # Keep track of no. of quotation marks.
    quote_counts = {u"'": 0, u'"': 0}
    # The *prepend_space* variable is used to control the "effects" of
    # detokenization as the function loops through the list of tokens and
    # changes the *prepend_space* accordingly as it sequentially checks
    # through the language specific and language independent conditions.
    prepend_space = " "
    detokenized_text = ""
    tokens = text.split()
    # Iterate through every token and apply language specific detokenization rule(s).
    for i, token in enumerate(iter(tokens)):
        # Check if the first char is CJK.
        if is_cjk(token[0]):
            # Perform left shift if this is a second consecutive CJK word.
            if i > 0 and is_cjk(token[-1]):
                detokenized_text += token
            # But do nothing special if this is a CJK word that doesn't follow a CJK word
            else:
                detokenized_text += prepend_space + token
            prepend_space = " "
        # If it's a currency symbol.
        elif token in self.IsSc:
            # Perform right shift on currency and other random punctuation items
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif re.match(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
            # In French, these punctuations are prefixed with a non-breakable space.
            if self.lang == 'fr' and re.match(r'^[\?\!\:\;\\\%]$', token):
                detokenized_text += " "
            # Perform left shift on punctuation items.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'en' and i > 0
              and re.match(u'^[\'][{}]'.format(self.IsAlpha), token)
              and re.match(u'[{}]'.format(self.IsAlnum), token)):
            # For English, left-shift the contraction.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'cs' and i > 1
              and re.match(r'^[0-9]+$', tokens[-2])  # If the previous previous token is a number.
              and re.match(r'^[.,]$', tokens[-1])  # If previous token is a dot.
              and re.match(r'^[0-9]+$', token)):  # If the current token is a number.
            # In Czech, left-shift floats that are decimal numbers.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang in ['fr', 'it'] and i <= len(tokens) - 2
              and re.match(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.match(u'^[{}]$'.format(self.IsAlpha), tokens[i + 1])):  # If the next token is alpha.
            # For French and Italian, right-shift the contraction.
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif (self.lang == 'cs' and i <= len(tokens) - 3
              and re.match(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.match(u'^[-–]$', tokens[i + 1])
              and re.match(u'^li$|^mail.*', tokens[i + 2], re.IGNORECASE)):
            # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
            # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
            detokenized_text += prepend_space + token + tokens[i + 1]
            next(tokens, None)  # Advance over the dash
            prepend_space = ""
        # Combine punctuation smartly.
        elif re.match(r'''^[\'\"„“`]+$''', token):
            normalized_quo = token
            if re.match(r'^[„“”]+$', token):
                normalized_quo = '"'
            quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
            if self.lang == 'cs' and token == u"„":
                quote_counts[normalized_quo] = 0
            if self.lang == 'cs' and token == u"“":
                quote_counts[normalized_quo] = 1
            if quote_counts[normalized_quo] % 2 == 0:
                if (self.lang == 'en' and token == u"'" and i > 0
                        and re.match(r'[s]$', tokens[i - 1])):
                    # Left shift on single quote for possessives ending
                    # in "s", e.g. "The Jones' house"
                    detokenized_text += token
                    prepend_space = " "
                else:
                    # Right shift.
                    detokenized_text += prepend_space + token
                    prepend_space = ""
                    quote_counts[normalized_quo] += 1
            else:
                # Left shift.
                detokenized_text += token
                prepend_space = " "
                quote_counts[normalized_quo] += 1
        elif (self.lang == 'fi' and re.match(r':$', tokens[i - 1])
              and re.match(self.FINNISH_REGEX, token)):
            # Finnish : without intervening space if followed by case suffix
            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
            detokenized_text += prepend_space + token
            prepend_space = " "
        else:
            detokenized_text += prepend_space + token
            prepend_space = " "
    # Merge multiple spaces.
    regexp, substitution = self.ONE_SPACE
    detokenized_text = re.sub(regexp, substitution, detokenized_text)
    # Remove leading and trailing spaces.
    detokenized_text = detokenized_text.strip()
    return detokenized_text if return_str else detokenized_text.split()
def tokenize(self, tokens, return_str=False, unescape=True):
    """
    Python port of the Moses detokenizer.

    :param tokens: A list of strings, i.e. tokenized text.
    :type tokens: list(str)
    :param return_str: If True, return a single detokenized string instead of a list.
    :return: str
    """
    # Convert the list of tokens into a string and pad it with spaces.
    text = u" {} ".format(" ".join(tokens))
    # Converts input string into unicode.
    text = text_type(text)
    # Detokenize the aggressive hyphen split.
    regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
    text = re.sub(regexp, substitution, text)
    if unescape:
        # Unescape the XML symbols.
        text = self.unescape_xml(text)
    # Keep track of no. of quotation marks.
    quote_counts = {u"'": 0, u'"': 0, u"``": 0, u"`": 0, u"''": 0}
    # The *prepend_space* variable is used to control the "effects" of
    # detokenization as the function loops through the list of tokens and
    # changes the *prepend_space* accordingly as it sequentially checks
    # through the language specific and language independent conditions.
    prepend_space = " "
    detokenized_text = ""
    tokens = text.split()
    # Iterate through every token and apply language specific detokenization rule(s).
    for i, token in enumerate(iter(tokens)):
        # Check if the first char is CJK.
        if is_cjk(token[0]):
            # Perform left shift if this is a second consecutive CJK word.
            if i > 0 and is_cjk(token[-1]):
                detokenized_text += token
            # But do nothing special if this is a CJK word that doesn't follow a CJK word
            else:
                detokenized_text += prepend_space + token
            prepend_space = " "
        # If it's a currency symbol.
        elif token in self.IsSc:
            # Perform right shift on currency and other random punctuation items
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
            # In French, these punctuations are prefixed with a non-breakable space.
            if self.lang == 'fr' and re.search(r'^[\?\!\:\;\\\%]$', token):
                detokenized_text += " "
            # Perform left shift on punctuation items.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'en' and i > 0
              and re.search(u"^[\'][{}]".format(self.IsAlpha), token)):
            # and re.search(u'[{}]$'.format(self.IsAlnum), tokens[i-1])):
            # For English, left-shift the contraction.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang == 'cs' and i > 1
              and re.search(r'^[0-9]+$', tokens[-2])  # If the previous previous token is a number.
              and re.search(r'^[.,]$', tokens[-1])  # If previous token is a dot.
              and re.search(r'^[0-9]+$', token)):  # If the current token is a number.
            # In Czech, left-shift floats that are decimal numbers.
            detokenized_text += token
            prepend_space = " "
        elif (self.lang in ['fr', 'it', 'ga'] and i <= len(tokens) - 2
              and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.search(u'^[{}]$'.format(self.IsAlpha), tokens[i + 1])):  # If the next token is alpha.
            # For French and Italian, right-shift the contraction.
            detokenized_text += prepend_space + token
            prepend_space = ""
        elif (self.lang == 'cs' and i <= len(tokens) - 3
              and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
              and re.search(u'^[-–]$', tokens[i + 1])
              and re.search(u'^li$|^mail.*', tokens[i + 2], re.IGNORECASE)):
            # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
            # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
            detokenized_text += prepend_space + token + tokens[i + 1]
            next(tokens, None)  # Advance over the dash
            prepend_space = ""
        # Combine punctuation smartly.
        elif re.search(r'''^[\'\"„“`]+$''', token):
            normalized_quo = token
            if re.search(r'^[„“”]+$', token):
                normalized_quo = '"'
            quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
            if self.lang == 'cs' and token == u"„":
                quote_counts[normalized_quo] = 0
            if self.lang == 'cs' and token == u"“":
                quote_counts[normalized_quo] = 1
            if quote_counts[normalized_quo] % 2 == 0:
                if (self.lang == 'en' and token == u"'" and i > 0
                        and re.search(r'[s]$', tokens[i - 1])):
                    # Left shift on single quote for possessives ending
                    # in "s", e.g. "The Jones' house"
                    detokenized_text += token
                    prepend_space = " "
                else:
                    # Right shift.
                    detokenized_text += prepend_space + token
                    prepend_space = ""
                    quote_counts[normalized_quo] += 1
            else:
                # Left shift.
                detokenized_text += token
                prepend_space = " "
                quote_counts[normalized_quo] += 1
        elif (self.lang == 'fi' and re.search(r':$', tokens[i - 1])
              and re.search(self.FINNISH_REGEX, token)):
            # Finnish : without intervening space if followed by case suffix
            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
            detokenized_text += prepend_space + token
            prepend_space = " "
        else:
            detokenized_text += prepend_space + token
            prepend_space = " "
    # Merge multiple spaces.
    regexp, substitution = self.ONE_SPACE
    detokenized_text = re.sub(regexp, substitution, detokenized_text)
    # Remove leading and trailing spaces.
    detokenized_text = detokenized_text.strip()
    return detokenized_text if return_str else detokenized_text.split()
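# Usage sketch (assumption: this tokenize() method lives on a Moses detokenizer
# class constructed with a language code, e.g. sacremoses.MosesDetokenizer; the
# tokens below are illustrative):
#     md = MosesDetokenizer(lang='en')
#     md.tokenize(['Hello', ',', 'world', '!'], return_str=True)
#     # -> 'Hello, world!'  (punctuation is left-shifted onto the preceding word)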
def is_jp_word(word):
    """Return True if the word contains any Japanese (CJK) characters."""
    return any(is_cjk(char) for char in word)
def scrape_html(source, trim, dest, lang):
    # this must be r+ mode because a+ doesn't work with beautifulsoup
    with open(source, "r+") as f:
        soup = BeautifulSoup(f, "html5lib")
        # removes all script and style tags
        for script in soup(["script", "style"]):
            script.decompose()
        # just the text
        text = soup.get_text()
        lines = [
            line for line in text.split()
            if line != "" and any(is_cjk(char) for char in line)
        ]
        # <i> tags
        eyes = [
            i["title"] for i in soup.find_all("i")
            if i.has_attr("title") and any(is_cjk(char) for char in i["title"])
        ]
        # tooltip attributes in spans
        tooltips = [
            extract_cjk(s["uib-tooltip"]) for s in soup.find_all("span")
            if s.has_attr("uib-tooltip")
            and any(is_cjk(char) for char in s["uib-tooltip"])
        ]
        lines += eyes
        lines += tooltips
        # sort the lines by length, to avoid the edge case where a short string is a
        # substring of a longer string and the short substring gets replaced first and
        # breaks the longer string into 日本romaji語 that doesn't get picked up by the regex
        lines = sorted(lines, key=len, reverse=True)
        # only take the first three words of long romaji strings
        romaji = {
            "_".join(kanji_to_romaji(line).split()[:3]): line
            for line in lines
        }
        # convert the entire html doc into a string to parse with regex; this is because
        # contents breaks the document into pieces by parent elements and re.search
        # doesn't work well
        soup_str = soup.prettify()
        # the prefix for the json keys
        file_path = os.path.normpath(f.name)
        file_path = file_path.replace(os.path.splitext(file_path)[1], "")
        # removes a subset of directories from the filepath up to and including the trim argument
        if trim:
            trim_index = file_path.find(trim)
            if trim_index == -1:
                raise ValueError("Subdir path doesn't exist")
            trim_path = file_path[(trim_index + len(trim)):].replace(trim, "")
            split_path = trim_path.split(os.sep)[1:]
            camelCase = [s.title() for s in split_path[1:]]
            strip_punctuation = [s.replace("_", "") for s in camelCase]
            file_prefix = "".join([split_path[0]] + strip_punctuation)
        # removes everything in the path up to zcs
        else:
            split_path = file_path.split(os.sep)[1:]
            split_dirs = split_path[split_path.index("zcs"):]
            camelCase = [s.title() for s in split_dirs[1:]]
            strip_punctuation = [s.replace("_", "") for s in camelCase]
            file_prefix = "".join([split_dirs[0]] + strip_punctuation)
        # overwrite japanese strings with {{ romaji tags }}
        for key, value in romaji.items():
            jp_re = re.compile(value)
            soup_str = jp_re.sub(
                f"{{{{ '{file_prefix}.{key}' | translate }}}}", soup_str)
        # path to save the json file to
        base_dir = os.path.split(f.name)[0]
        json_file = f"{os.path.splitext(os.path.split(f.name)[1])[0]}.{lang}.json"
        if dest:
            home = os.getcwd()
            output_path = os.path.join(home, dest, json_file)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # write out a json file adjacent to the html file
            output_path = os.path.join(base_dir, json_file)
        with open(output_path, "w+") as rj:
            json.dump({file_prefix: romaji}, rj, ensure_ascii=False, indent=2)
        # zero out the file and replace it with the modified string in place
        f.seek(0)
        f.truncate()
        f.write(soup_str)
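# Usage sketch for scrape_html(); the path and language code are illustrative,
# not from the source. Assumes BeautifulSoup/html5lib and kanji_to_romaji are
# installed; note that when trim is falsy the path must contain a "zcs" segment:
#     scrape_html("app/zcs/views/settings.html", trim=None, dest=None, lang="ja")
#     # -> rewrites settings.html in place and writes settings.ja.json next to it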
def is_valid_text(s):
    """
    Check whether HTML text content is valid.
    Text content is considered valid if it contains at least one Japanese character.
    """
    return any(is_cjk(ch) for ch in s)