Example #1
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

def text_to_wordlist(text, remove_stopwords=False, stem_words=False, comma=True):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Optionally, strip punctuation (the dash is escaped here so the class is
    # not read as a character range)
    if not comma:
        text = re.sub(r"[,:\/\^.$%#+\-></\?\=*\\]", " ", text)  # origin is [^A-Za-z0-9^,!.\/'+-=?]

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
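A minimal usage sketch for the function above (assuming the NLTK stopwords corpus has been downloaded, e.g. via nltk.download("stopwords")):

sample = "The cats, surprisingly: kept running!"
print(text_to_wordlist(sample, remove_stopwords=True, stem_words=True, comma=False))
# prints the lowercased, stopword-free, stemmed text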
Example #2
import re
from nltk.corpus import stopwords

# Assumed module-level names (representative definitions):
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE.sub(' ', text)
    text = BAD_SYMBOLS.sub(' ', text)
    text = re.sub(r"\'s", " ", text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
Example #3
import re
import string

def prepare_for_char_n_gram(text):
    """Simple text clean-up process."""
    # 1. Go to lower case (only good for English)
    # Work on byte strings, as the author had issues removing all \n in r""
    clean = bytes(text.lower(), encoding="utf-8")
    # 2. Drop \n, \t, \b and \r
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")
    # 3. Replace English contractions; `patterns` is assumed to be a
    # module-level list of (byte pattern, replacement) pairs
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop punctuation
    # The regex package would also work here: regex.sub(b"\p{P}", " ")
    exclude = re.compile(
        b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    # 5. Drop numbers - as a scientist I don't think numbers are toxic ;-)
    clean = re.sub(rb"\d+", b" ", clean)
    # 6. Remove extra spaces - the previous operations may have produced
    # multiple consecutive spaces
    clean = re.sub(rb'\s+', b' ', clean)
    # Remove trailing space if any
    clean = re.sub(rb'\s+$', b'', clean)
    # 7. Now surround each word with # signs,
    # e.g. "my name is bond" becomes "#my# #name# #is# #bond#"
    clean = re.sub(b" ", b"# #", clean)  # replace each space with "# #"
    clean = b"#" + clean + b"#"  # add leading and trailing #

    return str(clean, 'utf-8')
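A runnable sketch for the function above, with a hypothetical stand-in for the module-level `patterns` table it assumes:

patterns = [(rb"won't", b"will not"), (rb"n't", b" not")]

print(prepare_for_char_n_gram("I won't stop!\n"))
# -> "#i# #will# #not# #stop#"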
Example #4
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def tokenize(text):
    # Replace URLs with a placeholder; `url_regex` is assumed to be a
    # module-level pattern (see the sketch below)
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Normalize: lowercase and strip non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words (a set makes the membership test fast)
    stops = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stops]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(tok).strip() for tok in tokens]

    return ' '.join(clean_tokens)
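A usage sketch; the url_regex below is a hypothetical stand-in for the module-level pattern the snippet assumes, and NLTK's punkt, stopwords and wordnet data must be downloaded:

url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

print(tokenize("Check https://example.com for updates!"))
# -> "check urlplaceholder update"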
Example #5
import re

def _remove_pattern(input_text_list):
    PATTERN_AT = r"@[\w]*"
    HTTPS_LINKS = r"https://[\w]*\.[\w]*/[\w\d]{,12}"
    NUMBER_SIGN = r"#"
    # Dash escaped so "!-_" is not read as a character range
    USELESS_CHAR = r"[^\w\s0-9\.,:;?!\-_'\"\(\)\*]"
    # EMOJI_CHAR = "[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]"

    cleaned_text_list = []
    for text in input_text_list:
        text = re.sub(PATTERN_AT, "", text)  # remove @user mentions
        text = re.sub(HTTPS_LINKS, "", text)  # remove https links
        text = re.sub(NUMBER_SIGN, "", text)  # remove the hash sign #
        # Remove everything outside the main character set (odd symbols,
        # emoji, etc.), keeping letters, digits and common punctuation
        text = re.sub(USELESS_CHAR, "", text)

        # Separate punctuation from words so each mark is its own token
        text = re.sub(r"\.", " .", text)
        text = re.sub(r",", " ,", text)
        text = re.sub(r":", " :", text)
        text = re.sub(r";", " ;", text)
        text = re.sub(r"\?", " ?", text)
        text = re.sub(r"!", " !", text)
        text = re.sub(r"-", " -", text)
        text = re.sub(r"_", " _", text)
        text = re.sub(r"'", " '", text)
        text = re.sub(r"\"", " \" ", text)
        text = re.sub(r"\(", "", text)
        text = re.sub(r"\)", "", text)
        text = re.sub(r"\*", " * ", text)

        text = text.strip()
        text = text.lower()
        cleaned_text_list.append(text)
    return cleaned_text_list
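For example, calling the function above on a couple of tweets strips mentions, hash signs and short links, then spaces out the remaining punctuation:

tweets = ["@user Great #Python tips: https://t.co/abc123 !", "So good."]
print(_remove_pattern(tweets))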
Example #6
import re

def clean_text(text):
    ''' Pre-process a text and return the cleaned string '''
    text = str(text)
    text = text.lower()

    # Clean the text (dash escaped so '+-=' is not read as a range)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)  # the space keeps "isn't" from becoming "isnot"
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\/", " / ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # e.g. "10k" -> "10000"
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r"\0s", " 0 ", text)  # note: \0 is the NUL escape; likely meant "0s"
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r" j k ", " jk ", text)
    text = re.sub(r"\s{2,}", " ", text)  # collapse runs of whitespace
    return text
Example #7
import string
import sys

import nltk

if sys.version_info < (3,):
    maketrans = string.maketrans
else:
    maketrans = str.maketrans

def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged words (or tokens).
    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
    # Returns
        A list of (word, POS tag) tuples.
    """
    if lower:
        text = text.lower()

    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))

    text = text.translate(translate_map)
    seq = text.split()
    return nltk.pos_tag(seq)
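A quick usage sketch (NLTK's averaged_perceptron_tagger data must be downloaded):

print(text_to_word_sequence("The quick brown fox jumps!"))
# -> a list of (word, tag) tuples such as ('quick', 'JJ')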
Example #8
import re

def clean_text(text):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text
Example #9

import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # assumed module-level lemmatizer

def process_sentence(text, objects='False'):
    '''
    Simple and dirty text preprocessing to remove some misspelled words
    and lemmatize. Relies on a module-level `correction` spell-fix helper.
    '''
    text = text.lower()
    old_text = text

    text = text.replace('1', 'one').replace('2', 'two').replace(
        '3', 'three').replace('4', 'four').replace('5', 'five').replace('6', 'six').replace(
        '.', '').replace('contains', 'contain').replace(
        'which', '').replace('are there', 'there are').replace(
        'there is', '').replace('ablue', 'a blue').replace(
        'corner', 'edge').replace('wall', 'edge').replace('yelow', 'yellow').replace(
        'below', 'beneath').replace(
        'brick', 'block').replace('leats', 'least').replace('is touching', 'touching')
    text = re.sub(r'colour([\W])', 'color ', text)
    text = re.sub(r'colored([\W])', 'color ', text)
    text = re.sub(r'coloured([\W])', 'color ', text)
    text = text.split(' ')
    text = map(correction, [t for t in text if t])
    text = [lemmatizer.lemmatize(x) if x not in [u'as', u'wall'] else x for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:
        text = text.replace('contain', 'with').replace('ha ', 'with ')
    text = re.sub(r'(^|\W)a([\W])', ' one ', text)
    text = re.sub(r'(^)ll ', ' ', text)
    text = re.sub(r'(^)t ', 'at ', text)
    text = ' '.join([t for t in text.split(' ') if t])
    text = text.replace('based', 'base')
    return text
Example #10

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Assumed module-level patterns (representative definitions):
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers with a placeholder token
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
Example #11
import re
from pyvi import ViTokenizer

def preprocess(text):
    text = text.lower()
    list_happen = [
        "😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚", "😋", '😜', "😝", "🤗",
        ":)", ":}", "^^", ";)", "👌", "=))", "😅", "👍", "👍🏻", "💕", "❤", "👏", "💟",
        "<3", ":D", ":P", "^_^", "😉", "✌️"
    ]
    list_sad = [
        "😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔", "🤕", "🤒", "👿", "🤬",
        "😤", '😫', "😩", "😭", ":(", "😈", "-_-", "👎"
    ]
    # Map emoticons to Vietnamese sentiment words: "vui" (happy), "buồn" (sad)
    for happen in list_happen:
        text = text.replace(happen, " vui")
    for sad in list_sad:
        text = text.replace(sad, " buồn")

    # Strip punctuation and special symbols
    text = re.sub(
        '[\n!,.?@#?!.,#$%\()*+-/:;<=>@[\\]^_`{|}~`"""“”’∞θ÷α•−β∅³π‘₹´°£€\×™√²—–&]',
        '', text)
    # Word-segment the Vietnamese text
    text = ViTokenizer.tokenize(text)
    return text
Example #12

import re
import nltk

def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", "  ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", "  ", text)
    text = re.sub(r"\+", "  ", text)
    text = re.sub(r"\-", "  ", text)
    text = re.sub(r"\=", "  ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r">", "  ", text)
    text = re.sub(r"<", "  ", text)
    text = re.sub(r"--", "  ", text)
    text = re.sub(r"`", "  ", text)
    text = re.sub(r"#", "  ", text)
    text = re.sub(r"@", "  ", text)
    text = re.sub(r"\$", "  ", text)  # escaped: a bare "$" matches end of string
    text = re.sub(r"%", "  ", text)
    text = re.sub(r"&", "  ", text)
    # Drop standalone numbers
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)
    text = re.sub(r"_", "  ", text)
    text = re.sub(r"\\", "  ", text)
    text = re.sub(r"~", "  ", text)
    text = re.sub(r"(\d+)(k)", r"", text)
    text = re.sub(r":", "  ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Keep only the words recognized by the (assumed module-level) get_word helper
    text_list = []
    for word in nltk.word_tokenize(text):
        case = get_word(word)
        if case:
            text_list.append(word)
    return text_list
Example #13
import re
from nltk.stem import SnowballStemmer

def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to stem words. (Note: despite the
    # parameter, this variant never removes stopwords.)

    # Convert words to lower case and split them
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9']", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)

    text = re.sub(r"\s{2,}", " ", text)

    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iphone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"iii", "3", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r" j k ", " jk ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
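A usage sketch showing the typo and abbreviation fixes in the function above:

print(text_to_wordlist("What's the quikly way to learn cs?"))
# "quikly" becomes "quickly" and " cs " becomes " computer science "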
Example #14
import re
import string
from nltk.stem import SnowballStemmer

def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()  # assumed module-level helper returning stopwords

    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation: str.translate needs a translation table, so
        # build one with str.maketrans (the original passed string.punctuation
        # directly, which removes nothing)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.lower()  # convert words to lower case

        # text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ",
                      text)  # 除A-Za-z0-9(),!?'`外的字符,去除
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        text = text.split()

        text = [word for word in text
                if word not in stoplist]  # remove stopwords once before stemming

        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]

        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]  # after stemming, remove stopwords again

        text = " ".join(cleanwordlist)

        cleaned_text_list.append(text)
    return cleaned_text_list
Example #15
import re
from bs4 import BeautifulSoup

# Assumed module-level patterns (representative definitions):
replace_with_space = re.compile(r'[/(){}\[\]\|@,;]')
bad_symbols = re.compile(r'[^0-9a-z #+_]')

def clean(text):
    """Conduct all text cleaning in one function."""
    text = text.lower()  # lowercase text
    text = re.sub(r'[0-9]+', '', text)  # drop digit runs
    text = BeautifulSoup(text, "html.parser").text  # HTML decoding
    text = replace_with_space.sub(' ', text)  # replace replace_with_space symbols by spaces
    text = bad_symbols.sub('', text)  # delete symbols matched by bad_symbols
    return text
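A usage sketch for the function above (requires beautifulsoup4):

print(clean("<p>Hello, World 2024!</p>"))
# digits are dropped, the HTML is decoded, and the remaining symbols cleaned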
Example #16
import re
import string

printable = set(string.printable)  # assumed module-level set of printable chars

def clean_text(text):
    # remove_urls and remove_users are assumed module-level helpers
    text = remove_urls(text)
    text = remove_users(text.replace('@ ', '@'))
    # text = remove_hash_tags(text)
    text = re.sub(r'\s+', ' ', text).strip()
    # filter() returns an iterator in Python 3, so join it back into a string
    text = ''.join(c for c in text if c in printable)
    text = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
Example #17
def bigram_text_to_word_sequence(text, bigram,
                                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                 lower=False, split=" "):
    '''filters: sequence of characters to filter out (the explicit default
    stands in for the old Keras base_filter()); bigram: a callable that maps
    the token list to bigrams.
    '''
    if lower:
        text = text.lower()
    text = text.translate(str.maketrans(filters, split * len(filters)))  # Python 3
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
Example #18

import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()  # assumed module-level lemmatizer

def clean_text(text):
    cleaned = text.lower()
    punctuations = string.punctuation
    cleaned = "".join(c for c in cleaned if c not in punctuations)
    words = cleaned.split()
    stopwords_list = stopwords.words("english")
    cleaned = [word for word in words if word not in stopwords_list]
    cleaned = [lem.lemmatize(word, "v") for word in cleaned]  # verbs first
    cleaned = [lem.lemmatize(word, "n") for word in cleaned]  # then nouns
    cleaned = " ".join(cleaned)
    return cleaned
Example #19
import re
from nltk.tokenize import word_tokenize

def preprocess(text):
    text = text.lower()
    # Keep word characters (with apostrophes) and the main punctuation marks
    doc = ' '.join(re.findall(r"[\w']+|[.,!?;/-]", text))
    doc = word_tokenize(doc)
    return doc
Example #20

import sys

def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_table = {ord(c): ord(t) for c, t in zip(filters, split * len(filters))}
    else:
        translate_table = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
Example #21
import re
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
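For example, the function above removes bracketed text, links, tags, punctuation and number-words in one pass:

print(clean_text("Visit https://example.com [now] to win <b>$1000k</b>!"))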
Example #22
import re
from nltk.corpus import stopwords as nltk_stopwords

# Assumed module-level names (representative definitions):
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
stopwords = set(nltk_stopwords.words('english'))

def clean_text(text):
    """
        text: a string

        return: modified initial string
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space
    # text = BAD_SYMBOLS_RE.sub('', text)  # optionally delete BAD_SYMBOLS_RE matches
    text = ' '.join(word for word in text.split() if word not in stopwords)  # delete stopwords
    return text
Example #23

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def removeStopWords(sen):
    # note the fixed "A-Z": the original "A-z" range also matched [\]^_`
    text = re.sub('[^a-zA-Z&]', ' ', sen)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [
        ps.stem(word) for word in text
        if word not in set(stopwords.words('english'))
    ]
    text = ' '.join(text)
    return text
Example #24

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e ?-? ?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
Example #25
import re

# Assumed module-level patterns (representative definitions):
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def clean_text(text):
    text = re.sub(r'\w*\d\w*', '', text).strip()  # remove words containing numbers
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE
    text = re.sub(r'\d+', '', text)  # drop any remaining digit runs
    return text
Example #26
def preprocess(text):
    text = text.lower()
    list_happen = ["😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚", "😋", '😜', "😝", "🤗",
                   ":)", ":}", "^^", ";)", "👌", "=))", "😅", "👍", "👍🏻", "💕", "❤", "👏", "💟",
                   "<3", ":D", ":P", "^_^", "😉", "✌️"]
    list_sad = ["😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔", "🤕", "🤒", "👿", "🤬",
                "😤", '😫', "😩", "😭", ":(", "😈", "-_-", "👎"]
    # Map emoticons to Vietnamese sentiment words: "vui" (happy), "tệ" (bad)
    for happen in list_happen:
        text = text.replace(happen, "vui")
    for sad in list_sad:
        text = text.replace(sad, "tệ")
    # text = ViTokenizer.tokenize(text)
    return text
Example #27
import re
import string

def _remove_pattern_2(input_text_list):

    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation: build a translation table with str.maketrans
        # (the original passed string.punctuation directly, which removes nothing)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.lower()  # convert words to lower case

        # Remove stop words
        # text = text.split()
        # stops = set(stopwords.words("english"))
        # text = [w for w in text if not w in stops and len(w) >= 3]

        # text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ",
                      text)  # 除A-Za-z0-9(),!?'`外的字符,去除
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # text = text.split()
        # stemmer = SnowballStemmer('english')
        # stemmed_words = [stemmer.stem(word) for word in text]
        # text = " ".join(stemmed_words)

        cleaned_text_list.append(text)
    return cleaned_text_list
Example #28
import re

def preprocessing(text):
    text = text.lower()
    text = re.sub(r'(\&lt)|(\&gt)', ' ', text)  # drop HTML-escaped < and >

    # Replace URLs with a generic "url" token
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        ' url ', text)
    # kp is assumed to be a module-level flashtext-style keyword processor,
    # and clean_punct an assumed module-level helper
    text = kp.replace_keywords(text)
    text = clean_punct(text)
    text = re.sub(r'[\n\r]', ' ', text)  # newlines (the original r'\n\r' only matched the pair)
    text = re.sub(r'\s{2,}', ' ', text)

    return text
Example #29

import re

class Preprocessor:
    # Method from a preprocessing class; lower, initial_filters and
    # remove_repetitions are assumed instance attributes.
    def _preprocess(self, text):
        # lower
        if self.lower:
            text = text.lower()

        # remove special chars
        if self.initial_filters is not None:
            text = re.sub(self.initial_filters, ' ', text)

        # collapse characters repeated three or more times to a single one,
        # e.g. "fuuuuck" -> "fuck"
        if self.remove_repetitions:
            pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
            text = pattern.sub(r"\1", text)
        return text
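The repetition rule can be exercised on its own; a standalone sketch:

import re

pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
print(pattern.sub(r"\1", "yesss cooool"))
# -> "yes col"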
Example #30
import re

def cleanser(questions):
    # Clean each question in place and return the list
    for i, text in enumerate(questions):
        text = text.lower()
        # Optionally, remove stop words
        #stops = set(stopwords.words("english"))
        #text = [w for w in text if not w in stops]

        #text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Optionally, shorten words to their stems
        #text = text.split()
        #stemmer = SnowballStemmer('english')
        #stemmed_words = [stemmer.stem(word) for word in text]

        questions[i] = text

    return questions