Example #1
 def cleanData(self, sentences):
     regExp = ExpandContractions()
     sentences = sentences.map(lambda x: regExp.expandContractions(x))
     replacer = RepeatReplacer()
     sentences = sentences.map(lambda x: replacer.replace(x))
     sentences = sentences.map(lambda x: x.encode('ascii', 'ignore'))
     normalize = NormalizeWords()
     sentences = sentences.map(
         lambda x: normalize.normalizeWords(x.decode("utf-8")))
     joinData = JoinData()
     sentences = sentences.map(lambda x: joinData.join(x))
     # reference: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
     # Tag each token with its part of speech, then lemmatize it accordingly
     tag_map = defaultdict(lambda: wn.NOUN)
     tag_map['J'] = wn.ADJ
     tag_map['V'] = wn.VERB
     tag_map['R'] = wn.ADV
     lmtzr = WordNetLemmatizer()
     for i in range(len(sentences)):
         pos_tokens = pos_tag(word_tokenize(sentences[i]))
         sentences[i] = [
             lmtzr.lemmatize(token, tag_map[tag[0]])
             for (token, tag) in pos_tokens
         ]
     sentences = sentences.map(lambda x: joinData.join(x))
     return sentences
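A minimal, self-contained sketch of the POS-aware lemmatization step used above (assuming the NLTK punkt, averaged_perceptron_tagger and wordnet data have been downloaded):

from collections import defaultdict

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Penn Treebank tags starting with J/V/R map to adjective/verb/adverb; default to noun
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

lmtzr = WordNetLemmatizer()
tagged = pos_tag(word_tokenize("the striped bats were hanging on their feet"))
print([lmtzr.lemmatize(token, tag_map[tag[0]]) for token, tag in tagged])
# roughly: ['the', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot']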
Example #2
 def __init__(self, word2vec_provider: Word2VecProvider,
              emoji_provider: EmojiProvider):
     self._emoji_provider = emoji_provider
     self._repeat_replacer = RepeatReplacer()
     self._polarity_replacer = PolarityReplacer()
     self._replacement_patterns = NEGATION_REPLACEMENT_PATTERNS
     self._replacement_patterns.extend([
         # remove urls
         (r'((www\.[^\s]+)|(https?://[^\s]+))', ''),
         # remove usernames
         (r'@[^\s]+', ''),
         # remove # from hashtags
         (r'#([^\s]+)', r'\1'),
         # leave only letters
         (r'[^a-zA-Z]+', ' '),
         # remove months
         (r'(\b\d{1,2}\D{0,3})?\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|'
          +
          r'aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(nov|dec)(?:ember)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.\-\/])'
          + r'\D?)?((19[7-9]\d|20\d{2})|\d{2})*', '')
     ])
     self._regexp_replacer = RegexpReplacer(self._replacement_patterns)
     self._stem_replacer = StemReplacer()
     self._word2vec_provider = word2vec_provider
     self._stopwords = stopwords.words('english')
     # also treat NEG_-prefixed forms as stopwords, so negation-marked stopwords are removed too
     self._stopwords.extend(['NEG_' + word for word in self._stopwords])
     self._stopwords.extend(["'nt", "st", "nd", "rd", "th", "rt"])
     self._stopwords.extend(self._emoji_provider.emoji)
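The replacers module itself is not shown in these listings; a pattern-list RegexpReplacer like the one constructed above is commonly written in the NLTK 3 Cookbook style, sketched here as an assumption rather than the project's actual code:

import re

class RegexpReplacer(object):
    """Apply an ordered list of (regex, replacement) substitutions to a text."""
    def __init__(self, patterns):
        self.patterns = [(re.compile(regex), repl) for regex, repl in patterns]

    def replace(self, text):
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text

replacer = RegexpReplacer([(r"won't", 'will not'), (r"can't", 'cannot'),
                           (r"(\w+)'ve", r'\1 have')])
print(replacer.replace("you won't believe what they've done"))
# you will not believe what they have done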
Example #3
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from replacers import RepeatReplacer


def ExtWd_preprocessing(sentence):
    """
    Separate words for a sample (or, a sentence), while pre-processing
    """
    tokens = word_tokenize(sentence)

    stopset = set(stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    s = nltk.stem.SnowballStemmer('english')
    stemmer = PorterStemmer()
    repeat_replacer = RepeatReplacer()
    words_cleaned = []

    for w in tokens:
        if w.isalpha():  # keep only purely alphabetic tokens
            w = repeat_replacer.replace(w)  # collapse repeated characters
            w = w.lower()  # lowercase
            if w not in stopset:  # skip stopwords
                # lemmatize, then stem the remaining word
                words_cleaned.append(stemmer.stem(lemmatizer.lemmatize(w)))
    return words_cleaned
Example #4
def repeat_replacer_document(document):
    from replacers import RepeatReplacer
    replacer = RepeatReplacer()
    return replacer.replace(document)
Example #5
import string

import nltk

from replacers import RegexpReplacer
from replacers import RepeatReplacer
from replacers import AntonymReplacer
from replacers import SpellingReplacer

# from pickle import dump
#
# output = open('t2.pkl', 'wb')
# dump(t2, output, -1)
# output.close()

test = "DO NOT GO THERE !!!\n\n1. I knew it was questionbale when i brought in oil i purchased for them to change out. He said they don't do this, because they like to purchase it. In other words, he needed to mark up the price for the same oil.\n\n2. He told me that our Shocks were blown out and said that we can't drive too far. Normally, when your shocks are blown out, your ride will be like a bouncing ball. I closely monitored my drive and i did not have a bumpy ride that indicated blown out shocks. I took it to two separate mechanics and they tested the car and said if the shocks were bad, the car would bounce up and down. \n\nBasically, the owner lied about the shocks to get me to pay to fix them. \n\n3. One of my light bulbs is going out. I looked up the model # to replace them and i went to autozone to purchase the ones for my car. The owner said that these are the wrong headlights and I needed a more expensive set. Now, mind you- the model's I had were based on Lexus' recommendation. \n\nHe then said that it would cost over $300 dollars to change out the bulbs. The bulbs he recommend was about $80 bucks, which means over 200 of labor. \n\nHe will over exaggerate everything to get you to pay more. \n\n\nBtw, I sent my wife in to see if he would try to run up maintenance. \n\nI would not recommend this place at all. He is not goood."
test = test.lower()

regex_replacer = RegexpReplacer()
repeat_replacer = RepeatReplacer()
spell_replacer = SpellingReplacer()
antonym_replacer = AntonymReplacer()

test = regex_replacer.replace(test)

# test = repeat_replacer.replace(test)
# tokens = antonym_replacer.replace_negations(sentence)
# tokens = repeat_replacer.replace(word)

# print(test)

sentences = nltk.sent_tokenize(test)
# # print(sentences)
stopwords = nltk.corpus.stopwords.words('english')
puncs = set(string.punctuation)
Example #6
import nltk
from replacers import RepeatReplacer
replacer = RepeatReplacer()
print(replacer.replace('happy'))
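The RepeatReplacer used throughout these examples is not shown either; the usual cookbook-style implementation collapses repeated characters recursively but leaves words that already exist in WordNet untouched, which is why 'happy' above comes back unchanged while elongated forms such as 'lotttt' in the later examples collapse. A sketch, assuming that implementation:

import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    """Collapse repeated characters unless the word is already a WordNet entry."""
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):  # real word, e.g. 'happy': leave it alone
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)  # keep collapsing until stable
        return repl_word

replacer = RepeatReplacer()
print(replacer.replace('happy'))   # happy
print(replacer.replace('lotttt'))  # lot
print(replacer.replace('ohhhhh'))  # oh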
Example #7
    lowercase_words[i] = tokens[i].lower()

#replacing words with regular expressions, e.g., 'won't' with 'will not'
#start with s, the untokenized text
replacer = RegexpReplacer()
replacedText = replacer.replace(s)
print(replacedText[:1000])

a = "I'm art won't bar can't he isn't you won't and they've but would've and she's while you're good and i'd here I'd"
a = replacer.replace(a)
print(a)

#edit words with repeating characters and then tokenize a test sentence
#will probably use on forum posts
forumPost = 'I just looooooove it. It is ooooooh so fun aaah oooookaaay whateverrrrr'
repReplacer = RepeatReplacer()
forumPostTokenized = word_tokenize(forumPost)

for i in range(0, len(forumPostTokenized)):
    forumPostTokenized[i] = repReplacer.replace(forumPostTokenized[i])

forumPostTokenized = ' '.join(forumPostTokenized)
print("\n\nBefore: ")
print(forumPost)
print("After: ")
print(forumPostTokenized)

#normalization in a different order. Normalize all text before it is tokenized
#first expand contractions
str1 = ''.join(s)
str1 = replacer.replace(str1)
Example #8
import re
import sys

import pandas as pd

from replacers import AntonymReplacer
from replacers import RegexpReplacer
from replacers import RepeatReplacer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#reading the csv file and extracting the column of tweets into a list
csv_file=sys.argv[1]

df=pd.read_csv(csv_file)
saved_column=df['text']
list1=list(saved_column)
#print (list1)

replacer=AntonymReplacer()
rep1=RepeatReplacer()
rep2=RegexpReplacer()

for i in range(0,len(list1)):
    list1[i]=re.sub(r'[^\x00-\x7F]',r' ',list1[i]) #Replacing non-ascii characters with a space
    list1[i]=rep2.replace(list1[i])                 #texts like can't are converted into can not
    list1[i]=list1[i].split()                       #Splitting each sentence into words
    #list1[i]=[w for w in list1[i] if (len(w)>2)]    #String length of a word is more than 2
    list1[i]=replacer.replace_negations(list1[i])   #Replaces the negative words with antonyms

emo = {}
with open('emotions.txt', 'r') as f:
    for line in f:
        line = line.split(',')
        emo[line[0]] = line[1].rstrip()
#print(emo)
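The replace_negations call above swaps 'not <word>' pairs for an antonym of <word> when WordNet offers an unambiguous one; a cookbook-style sketch of such a replacer follows (an assumption, since the project's replacers module is not shown):

from nltk.corpus import wordnet

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        # collect antonyms across all synsets; only trust an unambiguous result
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        return None

    def replace_negations(self, sent):
        # walk the token list, replacing ['not', w] with an antonym of w if found
        i, length = 0, len(sent)
        words = []
        while i < length:
            word = sent[i]
            if word == 'not' and i + 1 < length:
                ant = self.replace(sent[i + 1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words

replacer = AntonymReplacer()
print(replacer.replace_negations(['do', 'not', 'uglify', 'the', 'code']))
# typically ['do', 'beautify', 'the', 'code']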
Example #9
import nltk
from replacers import RepeatReplacer
replacer=RepeatReplacer()
print(replacer.replace('lotttt'))
print(replacer.replace('ohhhhh'))
print(replacer.replace('ooohhhhh'))

Example #10
import nltk
from replacers import RepeatReplacer
replacer = RepeatReplacer()
print(replacer.replace('lotttt'))
print(replacer.replace('ohhhhh'))
print(replacer.replace('ooohhhhh'))
Example #11
import nltk
from replacers import RepeatReplacer
replacer=RepeatReplacer()
print(replacer.replace('happy'))
Example #12
class TweetFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec_provider: Word2VecProvider,
                 emoji_provider: EmojiProvider):
        self._emoji_provider = emoji_provider
        self._repeat_replacer = RepeatReplacer()
        self._polarity_replacer = PolarityReplacer()
        self._replacement_patterns = NEGATION_REPLACEMENT_PATTERNS
        self._replacement_patterns.extend([
            # remove urls
            (r'((www\.[^\s]+)|(https?://[^\s]+))', ''),
            # remove usernames
            (r'@[^\s]+', ''),
            # remove # from hashtags
            (r'#([^\s]+)', r'\1'),
            # leave only letters
            (r'[^a-zA-Z]+', ' '),
            # remove months
            (r'(\b\d{1,2}\D{0,3})?\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|'
             +
             r'aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(nov|dec)(?:ember)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.\-\/])'
             + r'\D?)?((19[7-9]\d|20\d{2})|\d{2})*', '')
        ])
        self._regexp_replacer = RegexpReplacer(self._replacement_patterns)
        self._stem_replacer = StemReplacer()
        self._word2vec_provider = word2vec_provider
        self._stopwords = stopwords.words('english')
        # also treat NEG_-prefixed forms as stopwords, so negation-marked stopwords are removed too
        self._stopwords.extend(['NEG_' + word for word in self._stopwords])
        self._stopwords.extend(["'nt", "st", "nd", "rd", "th", "rt"])
        self._stopwords.extend(self._emoji_provider.emoji)

    @classmethod
    def _count_with_func(cls, tweet, func):
        count = 0
        for word in tweet.split(' '):
            if func(word):
                count += 1
        return count

    @classmethod
    def _count_occurrences(cls, tweet, letter):
        count = 0
        for l in tweet:
            if l == letter:
                count += 1
        return count

    @classmethod
    def _count_uppercase_words(cls, tweet):
        return cls._count_with_func(tweet, lambda word: word == word.upper())

    @classmethod
    def count_exclamation(cls, tweet):
        return cls._count_occurrences(tweet, '!')

    @classmethod
    def count_question_marks(cls, tweet):
        return cls._count_occurrences(tweet, '?')

    def count_positive_emoji(self, tweet):
        return self._count_with_func(
            tweet,
            lambda word: self._emoji_provider.is_positive_emoji(word.strip()))

    def count_negative_emoji(self, tweet):
        return self._count_with_func(
            tweet,
            lambda word: self._emoji_provider.is_negative_emoji(word.strip()))

    def clean_tweet(self, tweet):
        tweet = tweet.lower()
        # transform html encoded symbols
        tweet = BeautifulSoup(tweet, 'lxml').get_text()
        tweet = self._regexp_replacer.replace(tweet)
        tweet = word_tokenize(tweet)
        # eg loooove -> love
        tweet = self._repeat_replacer.replace(tweet)
        # stem tokens
        tweet = self._stem_replacer.replace(tweet)
        # mark negated tokens with a NEG_ prefix
        tweet = self._polarity_replacer.mark_negations(tweet)
        return " ".join(
            [word for word in tweet if word not in self._stopwords]).strip()

    def get_avg_word_similarity(self, tweet, main_word):
        current_similarities = set()
        for word in tweet.split(' '):
            sim = self._word2vec_provider.get_similarity(
                main_word, word.lower())
            if sim is not None:
                current_similarities.add(sim)

        if len(current_similarities) == 0:
            return

        if len(current_similarities) == 1:
            return current_similarities.pop()

        # return np.mean(zscore(list(current_similarities)))

        # if len(current_similarities) == 1:
        #    return current_similarities[0 ]
        current_similarities = list(current_similarities)

        max_sim = np.max(current_similarities)
        min_sim = np.min(current_similarities)
        # normalize to <0;1>
        return np.mean([(sim - min_sim) / (max_sim - min_sim)
                        for sim in current_similarities])

    def get_word2vec_vector(self, tweet):
        current_word2vec = []
        for word in tweet.split(' '):
            vec = self._word2vec_provider.get_vector(word.lower())
            if vec is not None:
                current_word2vec.append(vec)

        if len(current_word2vec) == 0:
            return np.zeros(200)

        return np.array(current_word2vec).mean(axis=0)

    def fit(self, x, y=None):
        return self

    def transform(self, texts):
        features = np.recarray(shape=(len(texts), ),
                               dtype=[('pos_emoji_count', float),
                                      ('neg_emoji_count', float),
                                      ('uppercase_word_count', float),
                                      ('exclamation_count', float),
                                      ('question_mark_count', float),
                                      ('clean_text', object),
                                      ('word2vec', np.ndarray)])

        for i, text in enumerate(texts):
            features['pos_emoji_count'][i] = self.count_positive_emoji(text)
            features['neg_emoji_count'][i] = self.count_negative_emoji(text)
            features['uppercase_word_count'][i] = self._count_uppercase_words(
                text)
            features['exclamation_count'][i] = self.count_exclamation(text)
            features['question_mark_count'][i] = self.count_question_marks(
                text)
            features['clean_text'][i] = self.clean_tweet(text)
            features['word2vec'][i] = self.get_word2vec_vector(text)

        return features
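A usage sketch for the extractor; the provider constructors below are hypothetical placeholders, as neither Word2VecProvider nor EmojiProvider is defined in this listing:

# hypothetical provider instances; both classes live outside this listing
word2vec_provider = Word2VecProvider()
emoji_provider = EmojiProvider()

extractor = TweetFeatureExtractor(word2vec_provider, emoji_provider)
tweets = ["I loooove this!!! :)", "@user do NOT go there http://example.com"]

features = extractor.fit(tweets).transform(tweets)
print(features['exclamation_count'])  # [3. 0.]
print(features['clean_text'][1])      # cleaned, stemmed, negation-marked text
print(features['word2vec'][0].shape)  # (200,) if the provider yields 200-d vectors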