import sacrebleu  # assumption: an older (pre-2.0) sacrebleu that exposes tokenize_v14_international at module level


def tokenize_python_string(s):
    s = s.replace("a", "aa")
    s = s.replace(" ", " ab ")
    s = s.replace("  ", " ")
    s = sacrebleu.tokenize_v14_international(s)

    # "ab" -> " ", "aa" -> "a", "\ x" -> "\x"
    psuedo_tokens = s.split(" ")
    tokens = []
    escaped = False
    for t in pseudo_tokens:
        if t == "":
            continue

        real_t = SpecialToken.STR_SPACE if t == "ab" else t.replace("aa", "a")
        if escaped:
            tokens.append("\\" + real_t)
            escaped = False
        elif real_t == "\\":
            escaped = True
        else:
            tokens.append(real_t)
            escaped = False

    return tokens
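
A hedged usage sketch for the function above. SpecialToken is assumed to be a project-defined enum providing STR_SPACE, and exact token boundaries depend on the sacrebleu version, so the sketch only illustrates the shape of the output rather than exact tokens.

tokens = tokenize_python_string("print('hi there')")
for tok in tokens:
    # Original spaces come back as SpecialToken.STR_SPACE; a lone backslash
    # token is re-joined with the following token by the `escaped` flag above.
    print(repr(tok))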
Example #2
import sacrebleu   # assumption: an older (pre-2.0) sacrebleu with module-level tokenize_13a / tokenize_v14_international
import sacremoses


def normalize(sentence: str,
              lowercase: bool = True,
              tokenizer: str = '13a',
              return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer == "13a":
        normalized_sent = sacrebleu.tokenize_13a(sentence)
    elif tokenizer == "intl":
        normalized_sent = sacrebleu.tokenize_v14_international(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence,
                                                               return_str=True,
                                                               escape=False)
    elif tokenizer == "penn":
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(
            sentence, return_str=True)
    else:
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
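
A hedged usage sketch for normalize, assuming sacrebleu (pre-2.0) and sacremoses are installed:

# Defaults: lowercase the sentence, apply 13a tokenization, return a string.
print(normalize("The QUICK brown fox, isn't it?"))
# International tokenization, returned as a list of tokens instead of a string.
print(normalize("The QUICK brown fox, isn't it?", tokenizer="intl", return_str=False))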
Example #3
def tokenize_line(cls, line: str, tokenization: VizSeqTokenization) -> str:
    if tokenization == VizSeqTokenization.none:
        return line
    elif tokenization == VizSeqTokenization.mteval_13a:
        return tokenize_13a(line)
    elif tokenization == VizSeqTokenization.mteval_v14_international:
        return tokenize_v14_international(line)
    elif tokenization == VizSeqTokenization.zh:
        return tokenize_zh(line)
    elif tokenization == VizSeqTokenization.char:
        return _tokenize_by_char(line)
    else:
        raise ValueError(f'Unknown tokenization {tokenization.name}')
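
The method above is a classmethod excerpted from its class; it dispatches on vizseq's VizSeqTokenization enum and on the sacrebleu tokenizers. A self-contained sketch of the same dispatch pattern, using a stand-in enum and character splitting so it runs without either dependency:

from enum import Enum

class Tokenization(Enum):  # stand-in for VizSeqTokenization
    none = 0
    char = 1

def tokenize_line_demo(line: str, tokenization: Tokenization) -> str:
    if tokenization == Tokenization.none:
        return line
    elif tokenization == Tokenization.char:
        return " ".join(line.replace(" ", ""))  # character-level tokens
    raise ValueError(f"Unknown tokenization {tokenization.name}")

print(tokenize_line_demo("ab cd", Tokenization.char))  # prints: a b c d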
Example #4
def process_string(tok, char2tok, tok2char, is_comment):
    if is_comment:
        tok = re.sub(' +', ' ', tok)
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r'\W', '', tok)) < 2:
            return ''
    tok = tok.replace(' ', ' SPACETOKEN ')
    for char, special_token in char2tok.items():
        tok = tok.replace(char, special_token)
    if tok.startswith(' STOKEN0'):
        if tok.endswith('\n'):
            tok = tok[:-1]
        tok += ' ENDCOM'
    tok = tok.replace('\n', ' STRNEWLINE ')
    tok = tok.replace('\t', ' TABSYMBOL ')
    tok = re.sub(' +', ' ', tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(' +', ' ', tok)
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    tok = tok.replace('\r', '')
    return tok
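
A hedged usage sketch for the function above. The real char2tok/tok2char maps are project-specific (they protect string and comment delimiters); the toy pair below is hypothetical, and tokenize_v14_international is assumed to be imported from an older (pre-2.0) sacrebleu release.

# Toy mappings: protect the double quote, then restore it after tokenization.
char2tok = {'"': ' QUOTETOKEN '}
tok2char = {'QUOTETOKEN': '"'}
print(process_string('say "hi"\n', char2tok, tok2char, is_comment=False))
# Roughly: spaces come back as SPACETOKEN, the newline as STRNEWLINE, and the
# quotes are restored as separate tokens.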
Example #5
def process_string(tok,
                   char2tok,
                   tok2char,
                   is_comment,
                   do_whole_processing=True):
    if not (do_whole_processing or is_comment):
        return tok.replace("\n", "\\n").replace("\r", "")

    if is_comment:
        tok = re.sub(" +", " ", tok)
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r"\W", "", tok)) < 2:
            return ""
    tok = replace_general_string_tok(tok)
    tok = replace_tokens(tok, char2tok)
    if tok.strip().startswith("STOKEN00"):
        if " STRNEWLINE " in tok:
            tok = tok.replace(" STRNEWLINE ", " ENDCOM", 1)
        else:
            tok += " ENDCOM"
    if not do_whole_processing:
        tok = replace_tokens(
            tok, {f" {key} ": value for key, value in tok2char.items()})
        tok = (tok.replace(" ▁ ", " ")
                  .replace(" TABSYMBOL ", "\t")
                  .replace("\\r", "")
                  .replace(" STRNEWLINE ", "\\n"))
        return tok

    tok = re.sub(" +", " ", tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(" +", " ", tok)
    tok = tok.replace("\r", "")
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    if tok[0].isalpha():
        # For prefixed string literals (e.g. L"s"), drop the space inserted after the prefix character.
        tok = tok.replace(f"{tok[0]} ", tok[0])
    return tok
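
When do_whole_processing is False and the token is not a comment, the function takes the early exit at the top, which only escapes newlines and strips carriage returns. That path can be exercised without any of the helper functions or tokenizers:

print(process_string("line one\nline two\r", {}, {},
                     is_comment=False, do_whole_processing=False))
# prints: line one\nline two   (a literal backslash-n, not a real newline)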
Example #6
def process_string(tok, char2tok, tok2char, is_comment):
    if is_comment:
        tok = re.sub(" +", " ", tok)
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r"\W", "", tok)) < 2:
            return ""
    tok = tok.replace(" ", " ▁ ")
    for char, special_token in char2tok.items():
        tok = tok.replace(char, special_token)
    if tok.startswith(" STOKEN0"):
        if tok.endswith("\n"):
            tok = tok[:-1]
        tok += " ENDCOM"
    tok = tok.replace("\n", " STRNEWLINE ")
    tok = tok.replace("\t", " TABSYMBOL ")
    tok = re.sub(" +", " ", tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(" +", " ", tok)
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    tok = tok.replace("\r", "")

    return tok
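
This variant protects spaces with the "▁" placeholder rather than SPACETOKEN. A self-contained sketch of the round trip that placeholder enables; restore_spaces is a hypothetical inverse for illustration, not part of the original project:

def protect_spaces(s: str) -> str:
    # Same trick as above: make original spaces visible as explicit tokens.
    return s.replace(" ", " ▁ ")

def restore_spaces(s: str) -> str:
    # Hypothetical detokenization-time inverse.
    return s.replace(" ▁ ", " ").replace("▁", " ")

protected = protect_spaces("a b")
print(protected)                  # prints: a ▁ b
print(restore_spaces(protected))  # prints: a b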