def prune_emojis_emoticons(string):
    # At times this fails, especially when an emoticon follows an
    # unregistered emoji such as O.o.
    emojis = emot.emoji(string)
    if "location" in emojis:
        # delete from the end so earlier locations stay valid
        for loc in reversed(emojis['location']):
            string = string[:loc[0]] + string[loc[1] + 1:]
    emoticons = emot.emoticons(string)
    if "location" in emoticons:
        for loc in reversed(emoticons['location']):
            string = string[:loc[0]] + string[loc[1] + 1:]
    return string
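A quick way to exercise it (hypothetical output; it assumes an emot version whose emot.emoji()/emot.emoticons() return a dict with 'value', 'location' and 'flag' lists):

import emot

# expected to drop both the emoji and the emoticon, version permitting
print(prune_emojis_emoticons("great game 🔥🔥 :-)"))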
def process_emoji(tweets):
    # stop_words = get_stop_words("en")

    # NOTE: with emot versions that return a dict of lists, len() counts the
    # dict's keys; there, use len(emot.emoji(tweets)['value']) instead.
    emoji_count = len(emot.emoji(tweets))
    emoticon_count = len(emot.emoticons(tweets))
    '''remove stop-words
    text_token = tokenizer.tokenize(tweets)
    non_stop_token = [word for word in text_token if word not in stop_words]
    non_stop_text = " ".join(non_stop_token)
    '''
    '''keep only word characters and apostrophes, collapse whitespace'''
    new_string = re.sub(r"[^\w']", " ", tweets)
    new_string = re.sub(r"\s+", ' ', new_string)
    new_string = new_string.strip()
    '''TextBlob: spell correction and polarity analysis'''
    text_blob = TextBlob(new_string)
    correct_string = str(text_blob.correct())

    polarity = text_blob.sentiment.polarity
    subjectivity = text_blob.sentiment.subjectivity
    emoji_result = emojiClass(correct_string, emoji_count, emoticon_count,
                              polarity, subjectivity)

    return emoji_result
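emojiClass is not defined in this snippet; a minimal stand-in, assuming it only bundles the computed fields, could be:

from dataclasses import dataclass

# hypothetical container matching the call above
@dataclass
class emojiClass:
    text: str
    emoji_count: int
    emoticon_count: int
    polarity: float
    subjectivity: float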
Example #3
def classByEmoji(text):
    global positive_emojis
    global negative_emojis
    global neutral_emojis

    emojis = emot.emoji(text)
    emoticons = emot.emoticons(text)
    emots = set()
    for map_emoji in emojis:
        emots.add(map_emoji['value'])
    for map_emoji in emoticons:
        emots.add(map_emoji['value'])

    positive_inter = emots.intersection(positive_emojis)
    negative_inter = emots.intersection(negative_emojis)
    neutral_inter = emots.intersection(neutral_emojis)

    if positive_inter:
        if len(negative_inter) == 0 and len(neutral_inter) == 0:
            return 1
    elif negative_inter:
        if len(neutral_inter) == 0:
            return -1
    elif neutral_inter:
        return 0

    return ''
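The three global sets are assumed to hold sentiment-labelled emoji; a hypothetical setup for trying the function (results depend on emot returning a list of match dicts and recognizing the symbols):

positive_emojis = {'😀'}   # toy lexicons; real lists would be far larger
negative_emojis = {'😡'}
neutral_emojis = {'😐'}

print(classByEmoji('loved it 😀'))   # expected 1
print(classByEmoji('so angry 😡'))   # expected -1
print(classByEmoji('plain text'))    # '' (no emoji signal)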
Example #4
def test_emo():
    test = "I love it, 👨 :-) 🏁:-) :-)🏁 :-) 🏁 <3"
    print(emo.emoji(test))
    print(emo.emoticons(test))
    print(test[27:30])
    print(test[17])
    return None
Example #5
def convert_emojis(old_text):
    smiley = emot.emoji(old_text)
    new_text = old_text
    if smiley['flag']:
        # str.replace avoids re.sub treating emoji like '*⃣' as regex syntax
        for value, mean in zip(smiley['value'], smiley['mean']):
            new_text = new_text.replace(value, mean)
    return new_text
Example #6
def _strip_emojis(text):
    emojis = {emoji['value'] for emoji in emot.emoji(text)}

    normalized = text
    for emoji in emojis:
        normalized = normalized.replace(emoji, '')

    return normalized
Example #7
def count_emoji(file_name):
    emoji_map = defaultdict(int)  # integer occurrence counts
    path = 'Dataset\\Tweets\\'
    if file_name.startswith("dataset"):
        with open(path + file_name, 'r', encoding="utf-8") as file_to_read:
            for line in file_to_read:
                for emoji in emot.emoji(line):
                    emoji_map[emoji['value']] += 1
    return emoji_map
Example #8
def is_emo(word: str) -> bool:
    """Use emot to detect whether something is an emoticon or emoji."""
    emoji = emot.emoji(word)['flag']
    emoticon = emot.emoticons(word)
    try:
        emoticon = emoticon['flag']
    except TypeError:
        # some emot versions return a list of dicts instead of a dict
        emoticon = emoticon[0]['flag']
    return bool(emoji) or bool(emoticon)
Example #9
def extract_emojis_emoticons(text):
    extracted = []
    vals = emot.emoticons(text)
    # 'flag' is True when at least one match was found
    if vals.get('flag'):
        extracted.extend(vals['value'])

    vals = emot.emoji(text)
    if vals.get('flag'):
        extracted.extend(vals['value'])
    return extracted
def find_emojis(text):
    """ Find and remove emojis in text.
        Return emojis founded and text without emojis.
    """
    emojis = []
    for emoji in emot.emoji(text):
        emojis.append(emoji['value'])
        text = text.replace(emoji['value'], '')

    return text, emojis
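An example round-trip, assuming emot recognizes both characters:

clean, found = find_emojis('see you soon 👋🌍')
print(clean)   # 'see you soon '
print(found)   # ['👋', '🌍']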
Example #11
def formate_smiley(txt):
    """
    Utilisent le package emot pour extraire les emoji (inutilisé à l'heure actuelle).
    Entrée : txt, une string
    Sortie : la liste des emoji texte et caractères.
    """
    lst = list(map(lambda x: x["value"], emot.emoji(txt)))
    lst2 = list(map(lambda x: x["value"], emot.emoticons(txt)))
    lst.extend(lst2)
    return lst
Example #12
def isEmoji(word):
    x = emot.emoji(word)
    if isinstance(x, list):
        # older emot versions return a list of match dicts
        return any(v['flag'] for v in x)
    return bool(x['flag'])
Example #13
def detect_emoticons_emojis(self, string):
    emoticons = emot.emoticons(string)
    emojis = emot.emoji(string)
    for emoticon in emoticons:
        value = emoticon['value']
        # skip stray ')' and ':' false positives
        if value not in (')', ':'):
            self.emoticons.setdefault(value, set()).add(string)
    for emoji in emojis:
        value = emoji['value']
        if value not in (')', ':'):
            self.emojis.setdefault(value, set()).add(string)
Example #14
    def text_demojis(text, how_replace=""):
        emojis = emot.emoji(text)
        if isinstance(emojis, list):
            emojis = emojis[0]
        if emojis['flag']:
            for index in range(len(emojis["value"])):
                if how_replace == 'mean':
                    source = emojis['value'][index]
                    target = emojis['mean'][index].split(':')[1]
                    text = text.replace(source, target)
                else:
                    text = text.replace(emojis['value'][index], "")

        return text
    def extract_emoji(text):
        try:
            emoticons_list = emot.emoticons(text)['value']
        except TypeError:
            # list-returning emot versions raise TypeError on ['value']
            emoticons_list = []

        try:
            emoji_list = emot.emoji(text)['value']
        except TypeError:
            emoji_list = []

        return emoticons_list + emoji_list
Example #16
def count_emojis(text):
	global skin_tones

	text = text.replace(':', '')
	emojis_counter = {}
	for map_emoji in emot.emoji(text):
		value = map_emoji['value']
		# skip skin-tone modifier code points
		if value not in skin_tones:
			emojis_counter[value] = emojis_counter.get(value, 0) + 1
	emojis_counter_sorted = sorted(emojis_counter.items(), key=operator.itemgetter(1), reverse=True)
	return emojis_counter_sorted
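skin_tones is a global defined elsewhere; presumably it holds the five Fitzpatrick skin-tone modifiers (U+1F3FB through U+1F3FF), e.g.:

# assumed definition of the skin_tones global
skin_tones = {'\U0001F3FB', '\U0001F3FC', '\U0001F3FD', '\U0001F3FE', '\U0001F3FF'}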
Example #17
def delete_emoji_and_emoticon(line):
    inline_emoji = emot.emoji(line)
    for data_x in inline_emoji:
        if data_x['value'] in line:
            line = line.replace(data_x['value'], ' ')
    for data_pe in pos_emoticons:
        if data_pe in line:
            line = line.replace(data_pe, ' ')
            emoticon_map[sentiment]["pos"] += 1
    for data_ne in neg_emoticons:
        if data_ne in line:
            line = line.replace(data_ne, ' ')
            emoticon_map[sentiment]["neg"] += 1
    for data_o in others:
        if data_o in line:
            line = line.replace(data_o, ' ')
    return line
Example #18
def convert_emojis(text):
    def replace_emoji(txt, indexes, replacements):
        for (index, replacement) in zip(indexes, replacements):
            txt[index] = replacement
        return txt

    converted_emojis = emot.emoji(text)
    if converted_emojis["flag"] is True:
        text = [x for x in text]
        idx_emojis = [
            location[0] for location in converted_emojis["location"]
        ]  # take the start index; assumes each emoji is a single character
        text = replace_emoji(
            text,
            idx_emojis,
            converted_emojis["mean"],
        )
        text = "".join(text)
    return text
Example #19
def base_emoji(text, flag):
    '''base_emoji returns the sentiment of the text based on the emojis and emoticons it contains.

    Args:
        text (str): Sentence or paragraph to score.
        flag (boolean): True --> gives 5 classes 0,1,2,3,4 where 2 (neutral), 4 (very positive), 1 (very negative)
                        False --> gives a float between -1 (negative) and 1 (positive)

    Returns:
        __prob_sentiment: if flag = True, a number (int) between 0 and 4
                          if flag = False, a number (float, rounded to 4 decimal places) between -1 and 1

    '''
    # convert input to string
    text = str(text)
    __temp_emoji = emot.emoji(text)
    __temp_emoti = emot.emoticons(text)
    __pre_final_text = ""

    # build a pseudo-text from the emoji and emoticon meanings
    if __temp_emoji['flag']:
        for data in __temp_emoji['mean']:
            __pre_final_text += str(data) + " "
    try:
        if __temp_emoti['flag']:
            for data in __temp_emoti['mean']:
                __pre_final_text += str(data) + " "
    except TypeError:
        # some emot versions return a list here instead of a dict
        pass

    # fall back to the raw text when no emoji or emoticon was found
    if len(__pre_final_text) < 2:
        __pre_final_text = text

    __analysis = TextBlob(__pre_final_text)

    # choose the output format of the sentiment based on flag
    if not flag:
        __prob_sentiment = round(__analysis.sentiment.polarity, 4)
    else:
        __prob_sentiment = get_solid_setiment(__analysis.sentiment.polarity)

    return __prob_sentiment
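Hypothetical calls (exact scores depend on TextBlob's lexicon and on the installed emot version):

print(base_emoji('I love this 😍', False))  # a float in [-1, 1]
print(base_emoji('I love this 😍', True))   # a class in 0..4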
Example #20
import pandas as pd
import emot
import re

dataset1 = pd.read_csv('dataset1_2.csv', dtype={'tweet_id': object})

#Count emojis and emoticons (with dict-returning emot versions, count the
#'value' lists instead of the dicts themselves)
detect_emojis_and_emoticons = lambda x: len(emot.emoji(x)) + len(
    emot.emoticons(x))
dataset1['count_of_emoticons'] = dataset1.text.apply(
    detect_emojis_and_emoticons)

#Detect and clean non-ASCII characters
clean_non_unicode = lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x).strip().lower()
dataset1['cleaned_text'] = dataset1.text.apply(clean_non_unicode)

#Detect and clean links
clean_links = lambda x: re.sub(r'http\S+', ' ', x).strip()
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_links)

#Clean hashtags
hashtags = [
    '#drunk', '#imdrunk', '#drank', '#sober', '#notdrunk', '#imnotdrunk'
]
hashtags_regex = re.compile('|'.join(map(re.escape, hashtags)))
clean_hashtags = lambda x: hashtags_regex.sub(' ', x)
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_hashtags)

#Remove punctuations and extra whitespaces
clean_punctuations = lambda x: re.sub(r'[^\w\s]', ' ', x)
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_punctuations)
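A quick sanity check of the derived columns (actual values depend on dataset1_2.csv):

print(dataset1[['text', 'cleaned_text', 'count_of_emoticons']].head())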
def produceWordEmbd(rawTweet):
    tweet = rawTweet

    # print(tweet)

    # Removing twitter handles' tags
    tweet = re.sub(r"@{1}[A-Za-z0-9_]+\s", ' ', tweet)

    # Removing web addresses
    tweet = re.sub(r"htt(p|ps)\S+", " ", tweet)

    # Removing email addresses
    emails = r'[a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}'
    tweet = re.sub(emails, " ", tweet)

    #Collecting all emojis and stripping them from the tweet
    emojis_dict = emot.emoji(tweet)
    emojis = []
    for z in emojis_dict:
        emojis.append(z['value'])
        # str.replace avoids regex-escaping issues with emoji characters
        tweet = tweet.replace(z['value'], '')
    # print(tweet, emojis)
    # Tokenizing with NLTK's word_tokenize
    tokens = word_tokenize(tweet)
    # print(tokens)

    # Getting hashtags intact
    newTokens = []
    for i, x in enumerate(tokens):
        if x == '#' and i < len(tokens) - 1:
            y = x + tokens[i + 1]
            newTokens.append(y)
        else:
            if i > 0:
                if (tokens[i - 1] != '#'):
                    newTokens.append(x)
            else:
                newTokens.append(x)

    # Getting clitics intact
    finalTokens = []
    for j, x in enumerate(newTokens):
        S = ["'s", "'re", "'ve", "'d", "'m", "'em", "'ll", "n't"]
        if x in S:
            y = newTokens[j - 1] + x
            finalTokens.append(y)
        else:
            if j < len(newTokens) - 1:
                if newTokens[j + 1] not in S:
                    finalTokens.append(x)
            else:
                finalTokens.append(x)

    # Eliminate case sensitivity
    for i, z in enumerate(finalTokens):
        finalTokens[i] = z.lower()

    # Getting rid of stopwords
    stopwordSet = set(stopwords.words('english'))
    filteredFinalTokens = []
    for i, z in enumerate(finalTokens):
        if z not in stopwordSet:
            filteredFinalTokens.append(z)

    for x in filteredFinalTokens:
        u = re.split(r"\\n", x)
        for m in u:
            vocabulary.append(m)
    # print(filteredFinalTokens)

    words = filteredFinalTokens
    word_vecs = []
    for word in words:
        fr = np.zeros(400)
        if word in w2v.vocab:
            tr = w2v[word]
            for k in range(400):
                fr[k] = tr[k]
            word_vecs.append(fr)

    for emoji in emojis:
        yr = np.zeros(400)
        if emoji in e2v.vocab:
            zr = e2v[emoji]
            # e2v vectors are 300-d; leave the tail zero-padded to 400
            for k in range(300):
                yr[k] = zr[k]
            word_vecs.append(yr)

    # the +1 guards against division by zero when no vectors were found
    return sum(word_vecs) / (len(word_vecs) + 1)
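w2v and e2v are pre-loaded embedding models; a plausible setup, assuming gensim KeyedVectors before 4.0 (where .vocab still exists) and placeholder file paths:

from gensim.models import KeyedVectors

# hypothetical paths; w2v is 400-d and e2v 300-d per the loops above
w2v = KeyedVectors.load_word2vec_format('word2vec_400d.bin', binary=True)
e2v = KeyedVectors.load_word2vec_format('emoji2vec_300d.bin', binary=True)
vocabulary = []  # filled as produceWordEmbd runs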
Example #22
def collectFeaturesFromTimeline(filename):
    user_feature = defaultdict(dict)
    tknzr = TweetTokenizer()
    tknzr_reduce = TweetTokenizer(strip_handles=True, reduce_len=True)
    stop_words = set(stopwords.words('english'))
    with gzip.open(filename, 'r') as inf:
        line_count = 0
        num_contr = 0
        num_tokens = 0
        # key: emoticon text, value: occurrence
        emoticon_use = defaultdict(int)
        emoji_use = defaultdict(int)
        num_ht = 0
        num_with_ht_tweets = 0
        all_tokens_set = set()
        reduced_all_tokens_set = set()
        tweet_text = []
        pos_bigrams = defaultdict(int)
        pos_trigrams = defaultdict(int)
        hashtags_occur = defaultdict(int)

        for line in inf:
            line_count += 1
            tweet = json.loads(line.decode('utf8'))
            userid_str = tweet['user']['id_str']

            # filtering section
            # English and non-retweet
            if tweet['lang'] != 'en':
                continue
            if 'retweeted_status' in tweet.keys():
                continue

            # note: need to get untruncated tweets
            text = tweet['full_text']
            if text == '':
                continue
            tweet_text.append(text)

            # Tokenize
            # tokenize has preserve_case option default as True
            tokens = tknzr.tokenize(text)
            num_tokens += len(tokens)
            all_tokens_set.update(set(tokens))

            if len(tweet['entities']['hashtags']) != 0:
                num_with_ht_tweets += 1
                for ht in tweet['entities']['hashtags']:
                    hashtags_occur[ht['text']] += 1
            num_ht += len(tweet['entities']['hashtags'])

            # lexical diversity: tokens in a tweet without URLs, user mentions
            # and stopwords, divided by the total number of tokens
            text_without_url = re.sub(r"http\S+", "", text)
            reduced_tokens = tknzr_reduce.tokenize(text_without_url)
            # drop a leading ':' token
            if reduced_tokens and reduced_tokens[0] == ':':
                reduced_tokens.pop(0)
            reduced_tokens_set = set(reduced_tokens)

            for sw in intersectionTokens(reduced_tokens_set, stop_words):
                # sw comes from the intersection, so remove cannot raise KeyError
                reduced_tokens_set.remove(sw)

            # order doesn't matter here
            reduced_all_tokens_set.update(reduced_tokens_set)

            # lots of :/ with URL in the text
            emoticons_text = emot.emoticons(text_without_url)
            if type(emoticons_text) == dict:
                for ele in emoticons_text['value']:
                    emoticon_use[ele] += 1
            emoji_text = emot.emoji(text)
            for ele in emoji_text['value']:
                emoji_use[ele] += 1

            # features need to loop over all the tokens
            # syntactic features and contractions
            # pos tagging
            tags = nltk.pos_tag(tokens)
            for i in range(len(tokens)):
                if tags[i][0] in contractions:
                    num_contr += 1
                # For noun's-like contractions
                if tags[i][1] == 'NN' or tags[i][1] == 'NNP':
                    if len(tags[i][0]) <= 3:
                        continue
                    if tags[i][0][-2:] == "'s" or tags[i][0][-3:] == "'re":
                        if i + 1 < len(tokens):
                            # a tag can't equal VBN and VBG at once; use 'in'
                            if tags[i + 1][1] in ('VBN', 'VBG'):
                                num_contr += 1

                # not including BOS
                if i < len(tokens) - 1:
                    pos_bigrams[(tags[i][1], tags[i + 1][1])] += 1
                if i < len(tokens) - 2:
                    pos_trigrams[(tags[i][1], tags[i + 1][1],
                                  tags[i + 2][1])] += 1

    # line_count is the number of tweets read
    if line_count == 0 or num_tokens == 0:
        return {}
    user_feature[userid_str]['scraped_tweets_num'] = line_count

    # NOTE: store the user information for behavioral analysis, store the last tweet per user
    with open(os.path.join(outdir, "{}_user_unique.json".format(group)),
              'a',
              encoding='utf8') as uniquef:
        uniquef.write("{}\n".format(json.dumps(tweet)))

    # Collect features here
    # Type-Token Ratio
    user_feature[userid_str]['type_token_ratio'] = len(
        all_tokens_set) / num_tokens

    # Usage of Contractions
    user_feature[userid_str][
        'num_contractions_per_tweet'] = num_contr / line_count

    # lexical diversity
    user_feature[userid_str]['lexical_diversity'] = len(
        reduced_all_tokens_set) / len(all_tokens_set)

    # hashtags: hashtag occurrence and use by num of users
    user_feature[userid_str]['hashtags_occur'] = hashtags_occur
    hashtags_user = {}
    for key in hashtags_occur.keys():
        hashtags_user[key] = 1
    user_feature[userid_str]['hashtags_user'] = hashtags_user
    user_feature[userid_str]['num_ht_per_tweet'] = num_ht / line_count

    # emoticons and emoji
    user_feature[userid_str]['emoticon_use'] = emoticon_use
    user_feature[userid_str]['emojis'] = emoji_use

    # syntactic features
    user_feature[userid_str]['pos_bigrams'] = pos_bigrams
    user_feature[userid_str]['pos_trigrams'] = pos_trigrams

    # store the text for intangible feature use
    if os.path.exists(
            os.path.join(outdir, 'tmp/{}/{}.txt'.format(group, userid_str))):
        return user_feature
    with open(os.path.join(outdir, 'tmp/{}/{}.txt'.format(group, userid_str)),
              'w',
              encoding='utf8') as outf:
        for tt in tweet_text:
            # NOTE: replace \n with space in text
            tt = tt.replace("\n", ".")
            assert "\n" not in tt
            outf.write('{}\n'.format(tt))

    return user_feature
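A hypothetical invocation, assuming gzipped line-delimited tweet JSON and the outdir/group/contractions globals configured elsewhere:

# 'timelines/user123.json.gz' is a placeholder path
features = collectFeaturesFromTimeline('timelines/user123.json.gz')
for userid, feats in features.items():
    print(userid, feats['type_token_ratio'], feats['num_ht_per_tweet'])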
Example #23
def produceWordEmbd(rawTweet):
	tweet = rawTweet

	print(tweet)

	# Removing twitter handles' tags
	tweet = re.sub(r"@{1}[A-Za-z0-9_]+\s", ' ', tweet)

	# Removing web addresses
	tweet = re.sub(r"htt(p|ps)\S+", " ", tweet)

	# Removing email addresses
	emails = r'[a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}'
	tweet = re.sub(emails, " ", tweet)

	# Tokenizing with NLTK's word_tokenize
	tokens = word_tokenize(tweet)
	print(tokens)

	# Getting hashtags intact
	newTokens = []
	for i,x in enumerate(tokens):
		if x == '#' and i < len(tokens)-1:
			y = x + tokens[i+1]
			newTokens.append(y)
		else:
			if i>0:
				if (tokens[i-1]!='#'):
					newTokens.append(x)
			else:
				newTokens.append(x)

	# Getting clitics intact
	finalTokens = []
	for j,x in enumerate(newTokens):
		S = ["'s", "'re", "'ve", "'d", "'m", "'em", "'ll", "n't"]
		if x in S:
			y = newTokens[j-1] + x
			finalTokens.append(y)
		else:
			if j<len(newTokens)-1:
				if newTokens[j+1] not in S:
					finalTokens.append(x)
			else:
				finalTokens.append(x)

	# Eliminate case sensitivity
	for i,z in enumerate(finalTokens):
		finalTokens[i] = z.lower()

	# Getting rid of stopwords
	stopwordSet = set(stopwords.words('english'))
	filteredFinalTokens = []
	for i,z in enumerate(finalTokens):
		if z not in stopwordSet:
			filteredFinalTokens.append(z)

	print(filteredFinalTokens)

	# Treating emojis
	word_vecs = []
	for j,y in enumerate(filteredFinalTokens):
		z = emot.emoji(y)
		if z == []:
			if y in w2v.vocab:
				print("Adding word vector for " + y)
				word_vecs.append(w2v[y])
		else:
			w = re.findall(r"[A-Za-z0-9]+", y)
			# use zeroed 1-d vectors; np.ndarray would leave them uninitialized
			s1 = np.zeros(300)
			s2 = np.zeros(400)
			if w != []:
				w = w[0]
				t = re.sub(w,'',y)
				#s1 = np.ndarray(300, 1)
				#s2 = np.zeros(400, 1)
				if t in e2v.vocab:
					print("Computing emoji vector for " + t)
					s1 = e2v[t]
					print(type(s1))
					if w in w2v.vocab:
						print("Computing vector for word-part " + w)
						s2 = w2v[w]
					#s1 = [s1, np.zeros(100,1)]
					
				N = 100
				s1 = np.pad(s1, (0, N), 'constant')
				word_vecs.append(s1 + s2)
	# hand back the collected vectors
	return word_vecs
def n_total_emojis(string):
    result = emot.emoji(string)
    # dict-style emot result: count the matched values
    if "value" in result:
        return len(result["value"])
    return 0
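For instance, assuming a dict-returning emot that detects both emoji:

print(n_total_emojis('fun 😀😀'))  # expected 2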
# to lower case

amazon_file['review'] = amazon_file['review'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
amazon_file['review'].head()

# extract emoji

import emot

amazon_list = amazon_file['review']
l = []
for i in range(len(amazon_list)):
    #print (emot.emoji(reviewlist[i]).get("value","none"))
    x = emot.emoji(amazon_list[i]).get("mean", "none")
    l.append(x)

amazon_file['emoji'] = l

emoji_file1 = pd.read_csv('C:\\Users\\gupta\\.spyder-py3\\amazon_emotions.csv')
emoji_file1['emoji'] = emoji_file1['emoji'].str.replace(r'[^\w\s]', '', regex=True)
emoji_file1['emoji'].tail()
# punctuation removal

amazon_file['review'] = amazon_file['review'].str.replace(r'[^\w\s]', '', regex=True)
amazon_file['review'].tail()

# stop words removal

import nltk


for x in tqdm.trange(len(data)):
    s = data[x][0]
    for i in tqdm.trange(len(s)):
        print("Word embedding for ", i)
        produceWordEmbd(s[i])

L = len(vocabulary)

wordVectors = []

for l in tqdm.trange(L):
    y = vocabulary[l]
    z = emot.emoji(y)
    if z == []:
        if y in w2v.vocab:
            # print("Adding word vector for " + y)
            wordVectors.append(w2v[y])
    else:
        w = re.findall(r"[A-Za-z0-9]+", y)
        # np.zeros gives initialized 1-d vectors (np.ndarray would not)
        s1 = np.zeros(300)
        s2 = np.zeros(400)
        if w != []:
            w = w[0]
            t = re.sub(w, '', y)
            if t in e2v.vocab:
                # print("Computing emoji vector for " + t)
                s1 = e2v[t]
                if w in w2v.vocab:
                    s2 = w2v[w]
            # pad the 300-d emoji vector to 400 dims and store the sum,
            # mirroring the produceWordEmbd variant above
            s1 = np.pad(s1, (0, 100), 'constant')
            wordVectors.append(s1 + s2)
# to lower case

spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
spotify_file['Review'].head()

# extract emoji

import emot

spotify_list = spotify_file['Review']
l = []
for i in range(len(spotify_list)):
    #print (emot.emoji(reviewlist[i]).get("value","none"))
    x = emot.emoji(spotify_list[i]).get("mean", "none")
    l.append(x)

spotify_file['emoji'] = l

# punctuation removal

spotify_file['Review'] = spotify_file['Review'].str.replace(r'[^\w\s]', '', regex=True)
spotify_file['Review'].tail()

# stop words removal

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
Example #28
def process_title(word=''):
    # punctuation_regex = r"[()]?[.,!?;~:]+"
    specials = [r':-?[\)\(]+']

    word = word.lower()

    others = []
    for o in specials:
        m = re.search(o, word)
        if m is not None:
            others.append(m.group(0))
            word = word.replace(m.group(0), '')

    emoji = emot.emoji(word)
    #  detect skin code
    #     to_delete = []
    #     for i, emo in enumerate(emoji):
    #         if u"\U0001F3FB" <= emo['value'] <= u"\U0001F3FF":
    #             emoji[i-1]['value'] += emo['value']
    #             to_delete.append(i)
    #     for i in reversed(to_delete):
    #         del emoji[i]

    if not re.search('[a-zA-Z]', word):
        #  just manage emoji
        emos = list(map(lambda emo: emo['value'], emoji))
        for emo in emos:
            word = word.replace(emo, '')
        return ' '.join(emos) + ' ' + word

    emoticons = emot.emoticons(word)

    # skip all-words emoticons, normally wrong
    emoticons = list(filter(lambda emo: re.search('[^a-z]', emo['value']), emoticons))

    # merge single-char emoticons, normally wrong
    previous = -2
    to_delete = []
    for i, emo in enumerate(emoticons):
        if len(emo['value']) < 2 and emo['location'][0] == previous - 1:
            emoticons[i - 1]['value'] += emo['value']
            to_delete.append(i)
        previous = emo['location'][1]
    for i in reversed(to_delete):
        del emoticons[i]
    # remove remaining single-char emoticons
    emoticons = list(filter(lambda emo: len(emo['value']) >= 2, emoticons))

    emos = list(map(lambda emo: emo['value'], emoji + emoticons))
    for emo in emos:
        word = word.replace(emo, '')

    # punctuation
    #     punctuation = re.findall(punctuation_regex, word)
    #     for p in punctuation:
    #         word = word.replace(p, ' ')

    #     # parentheses
    #     word = re.sub(r'[\(\)]', '', word)

    # collapse multiple spaces
    word = re.sub(r'\s+', ' ', word).strip()

    # separated letters (i.e. 'w o r k o u t' or 'r & b')
    if re.match(r'^([\w&] )+[\w&]$', word):
        word = word.replace(' ', '')

    # hashtag
    word = re.sub(r'^#', '', word)

    # #remove stopwords
    # stop_words = stopwords.words('english')
    # ' '.join([w for w in word.split(' ') if w not in stop_words])

    # remove spaces
    word_no_spaces = word.replace(' ', '')

    # if(len(punctuation)>=1):
    #       print(punctuation)
    return ' '.join(emos + others) + ' ' + word + ' ' + word_no_spaces
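A hypothetical call; the exact output depends on what emot detects:

print(process_title('#workout :-) 💪'))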
spotify_file = pd.read_csv('spotify3.csv')
print(spotify_file)

# to lower case

spotify_file['Review'] = spotify_file['Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
spotify_file['Review'].head()

# extract emoji
# to be handled later
import emot

spotify_list = spotify_file['Review']
l = []
for i in range(len(spotify_list)):
    x = emot.emoji(spotify_list[i]).get("value", "none")
    l.append(x)

spotify_file['emoji'] = l

# punctuation removal
# not to be done
spotify_file['Review'] = spotify_file['Review'].str.replace(r'[^\w\s]', '', regex=True)
spotify_file['Review'].tail()

# stop words removal

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
Example #30
def place_emoji(self, insert_emoji):
    emoji = emot.emoji(insert_emoji)
    return emoji