Example 1
import re

def clean_personachat_text(text):
    # Normalize the raw PersonaChat utterance (helper defined elsewhere in the project).
    text = standardize_english_text(text)
    # Re-attach contractions that were split around the apostrophe:
    # "don ' t" -> "do n't", "i ' m" -> "i 'm", "they ' re" -> "they 're".
    text = re.sub(r"(\w)n ' (t\W)", r"\1 n'\2", text)
    text = re.sub(r" ' (m|s|re|ve|d|ll)(\W)", r" '\1\2", text)
    return text
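A minimal usage sketch of the function above, self-contained for illustration: standardize_english_text is stubbed as a pass-through here, which is an assumption, not the project's real normalizer.

import re

def standardize_english_text(text):
    # Hypothetical pass-through stub standing in for the project's normalizer.
    return text

def clean_personachat_text(text):
    text = standardize_english_text(text)
    text = re.sub(r"(\w)n ' (t\W)", r"\1 n'\2", text)
    text = re.sub(r" ' (m|s|re|ve|d|ll)(\W)", r" '\1\2", text)
    return text

print(clean_personachat_text("i don ' t know , i ' m not sure "))
# -> "i do n't know , i 'm not sure "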
Example 2
def tokenize(string):
    # Normalize the string, then tokenize it with the spaCy pipeline `nlp` loaded elsewhere.
    return [token.text for token in nlp(standardize_english_text(string))]
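A usage sketch under stated assumptions: nlp is taken to be a spaCy English pipeline (loaded here with en_core_web_sm as an example), and standardize_english_text is stubbed, since the real helper is project-specific.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: an English spaCy model is installed

def standardize_english_text(string):
    # Hypothetical stand-in for the project's normalizer.
    return string.strip()

def tokenize(string):
    return [token.text for token in nlp(standardize_english_text(string))]

print(tokenize("I don't know."))
# -> ['I', 'do', "n't", 'know', '.']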
Example 3
import re

def clean_swda_text(text):
    # Unwrap SwDA annotation braces, e.g. "{F uh }" -> "uh ".
    text = re.sub(r"\{\w (.*?)\}", r"\1", text)
    # Unwrap braces cut off by an interruption marker and drop the " --".
    text = re.sub(r"\{\w (.*?) --", r"\1", text)
    # Remove transcriber comments of the form "*[[ ... ]]".
    text = re.sub(r"\*\[\[.*?\]\]", "", text)
    text = standardize_english_text(text)
    return text
def clean_cornellmovie_text(text):
    # Strip the <u>...</u> markup used in the Cornell Movie-Dialogs corpus.
    text = text.replace("<u>", "")
    text = text.replace("</u>", "")
    text = standardize_english_text(text)
    return text
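A quick check of the two cleaners on synthetic inputs, meant to be run together with the definitions above; standardize_english_text is stubbed as a pass-through, which is an assumption rather than the project's real helper.

def standardize_english_text(text):
    # Hypothetical pass-through stub; the real normalizer lives elsewhere in the project.
    return text

print(clean_swda_text("{F uh, } i mean, *[[laughter]] yeah."))
# -> "uh,  i mean,  yeah."  (note the leftover double spaces)

print(clean_cornellmovie_text("<u>Not now.</u> Why not?"))
# -> "Not now. Why not?"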