def preprocessing_two_class(tweet):
    # Map emoticons to emoji tokens, run the (ekphrasis-style) text_processor
    # on the joined string, then pass the resulting tokens through str2emoji
    # once more.
    tweet = ' '.join(emoji.str2emoji(tweet.split()))
    tweets = text_processor.pre_process_doc(tweet)
    tweets = emoji.str2emoji(tweets)
    # POS-aware lemmatization: WordNet takes 'a'/'n'/'v' tags directly,
    # anything else falls back to the default (noun) lemma.
    tweets = [
        lemmatizer.lemmatize(word, grammar[0].lower()) if grammar[0].lower()
        in ['a', 'n', 'v'] else lemmatizer.lemmatize(word)
        for word, grammar in pos_tag(tweets)
    ]
    # Drop punctuation and stopword tokens.
    tweets = [
        token for token in tweets
        if (token not in punctuation) and (token not in stopwords)
    ]
    tweet = ' '.join(tweets)
    return tweet
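
All four snippets lean on module-level objects the listing never shows. The sketch below is one plausible setup, not the original project's: the NLTK pieces (pos_tag, WordNetLemmatizer, stopwords, TweetTokenizer) are standard, while emoji.str2emoji is a project-local helper (not the PyPI emoji package) and text_processor is assumed here to be an ekphrasis TextPreProcessor, the usual provider of a pre_process_doc method.

import re
import string

from nltk import pos_tag                  # needs 'averaged_perceptron_tagger'
from nltk.corpus import stopwords as sw   # needs 'stopwords'
from nltk.stem import WordNetLemmatizer   # needs 'wordnet'
from nltk.tokenize import TweetTokenizer
from unidecode import unidecode

# Assumption: project-local helper that maps emoticons such as ':)' to
# emoji tokens; the snippets pass it both token lists and raw strings.
import emoji

# Assumption: an ekphrasis preprocessor standing in for text_processor.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user', 'number'],
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)

lemmatizer = WordNetLemmatizer()
stopwords = set(sw.words('english'))
punctuation = set(string.punctuation)
tknzr = T = TweetTokenizer()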
Example 2
def standardization2(tweet):
    # Unescape literal "\u2019" (right single quote) and "\u002c" (comma),
    # then drop standalone numbers and the retweet marker "RT ".
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    tweet = re.sub(r"RT ", "", tweet)
    tweets = T.tokenize(tweet)
    tweets = emoji.str2emoji(tweets)
    # POS-aware lemmatization, as in the previous example.
    tweets = [
        lemmatizer.lemmatize(word, grammar[0].lower()) if grammar[0].lower()
        in ['a', 'n', 'v'] else lemmatizer.lemmatize(word)
        for word, grammar in pos_tag(tweets)
    ]
    tweets = [
        token for token in tweets
        if (token not in punctuation) and (token not in stopwords)
    ]
    # Drop leftover ellipsis fragments: tokens with four or more dots.
    tweets = list(filter(lambda x: x.count('.') < 4, tweets))
    return tweets
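
Unlike the other examples, standardization2 returns a token list rather than a re-joined string. A usage sketch (assuming the module-level objects the snippet expects are defined):

tokens = standardization2("RT that movie..... was \\u2019great\\u2019 100 %")
# "RT " and standalone numbers are stripped, the escapes become real
# characters, tokens are POS-tagged and lemmatized, stopword and
# punctuation tokens go away, and any token that still contains four or
# more dots is removed by the final filter.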
Example 3
def standardization(tweet):
    # Unescape "\u2019" (right single quote) and "\u002c" (comma).
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    # Transliterate to ASCII, lowercase, and map emoticons to emoji tokens.
    tweet = ' '.join(emoji.str2emoji(unidecode(tweet).lower().split()))
    # Strip URLs.
    tweet = re.sub(r"https?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?", " ", tweet)
    # Expand contractions; the specific " can't" rule must precede the
    # generic "n't" rule (see the note after this function).
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r" can\'t", " cannot", tweet)
    tweet = re.sub(r"n\'t", " not", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'s", "", tweet)
    tweet = re.sub(r"\'n", "", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    # Remove mentions, hashtags, and standalone numbers.
    tweet = re.sub(r"@\w+", ' ', tweet)
    tweet = re.sub(r"#\w+", ' ', tweet)
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    # POS-aware lemmatization, then stopword/punctuation filtering.
    tweet = [
        lemmatizer.lemmatize(i, j[0].lower())
        if j[0].lower() in ['a', 'n', 'v'] else lemmatizer.lemmatize(i)
        for i, j in pos_tag(tknzr.tokenize(tweet))
    ]
    tweet = [i for i in tweet if (i not in stopwords) and (i not in punctuation)]
    tweet = ' '.join(tweet)
    return tweet
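
The ordering of the contraction rules above is load-bearing: the specific " can't" -> " cannot" substitution has to run before the generic "n't" -> " not" one, otherwise "can't" decays into "ca not". A quick check:

import re

right = re.sub(r"n\'t", " not", re.sub(r" can\'t", " cannot", " can't stop"))
wrong = re.sub(r" can\'t", " cannot", re.sub(r"n\'t", " not", " can't stop"))
print(right)  # " cannot stop"
print(wrong)  # " ca not stop"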
Example 4
def standardization_teacher(tweet):
    # Unescape "\u2019" (right single quote) and "\u002c" (comma).
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    # Here str2emoji is applied to the raw string rather than a token list.
    tweet = emoji.str2emoji(tweet)
    # Strip URLs.
    tweet = re.sub(r"https?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?",
                   " ", tweet)
    # Expand common chat abbreviations.
    tweet = re.sub(r"u r ", " you are ", tweet)
    tweet = re.sub(r"U r ", " you are ", tweet)
    tweet = re.sub(r" u(\s|$)", " you ", tweet)
    tweet = re.sub(r"didnt", "did not", tweet)
    # Expand contractions; the specific " can't" rule must precede the
    # generic "n't" rule.
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r" can\'t", " cannot", tweet)
    tweet = re.sub(r"n\'t", " not", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'s", "", tweet)
    tweet = re.sub(r"\'n", "", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    # Remove mentions, hashtags, and standalone numbers.
    tweet = re.sub(r"@\w+", ' ', tweet)
    tweet = re.sub(r"#\w+", ' ', tweet)
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    tweet = re.sub(r" plz(\s|$)", " please ", tweet)
    # Strip a leading d.m.yyyy / dd-mm-yyyy style date.
    tweet = re.sub(
        r"^([1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-)([1-9]|1[0-2])(\.|-)20[0-9][0-9]",
        " ", tweet)
    # POS-aware lemmatization, then stopword/punctuation filtering.
    tweet = [
        lemmatizer.lemmatize(i, j[0].lower())
        if j[0].lower() in ['a', 'n', 'v'] else lemmatizer.lemmatize(i)
        for i, j in pos_tag(tknzr.tokenize(tweet))
    ]
    tweet = [
        i for i in tweet if (i not in stopwords) and (i not in punctuation)
    ]
    tweet = ' '.join(tweet)
    return tweet.lower()
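
Date patterns like this are easy to get subtly wrong, so the leading-date rule is worth sanity-checking in isolation:

import re

date_re = r"^([1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-)([1-9]|1[0-2])(\.|-)20[0-9][0-9]"
print(re.sub(date_re, " ", "31-10-2018 what a day"))  # "  what a day"
print(re.sub(date_re, " ", "3.4.2021 ok"))            # "  ok"
print(re.sub(date_re, " ", "great day"))              # "great day" (unchanged)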