# Example 1
def emoji_to_text(text):
    """Extract the descriptions of all emojis in *text* and dump them to CSV.

    demoji.findall returns a mapping {emoji: description}; only the
    descriptions are written, one per row, in column 'emoji_Text'.
    """
    # BUG FIX: the original called findall twice and then read the undefined
    # name `demoji_text` (a NameError at runtime); bind the result once.
    emoji_map = demoji.findall(text)
    descriptions = emoji_map.values()
    no_emoji_df = pd.DataFrame(descriptions, columns=['emoji_Text'])
    print(no_emoji_df)
    no_emoji_df.to_csv(
        r'C:\Users\heromero\Desktop\Stefanini\Desarollo_Redes_sociales\Nueva carpeta\IGphotoDownloader\emoji_to_text.csv'
    )
# Example 2
    def get_stats_frame(self):
        """Build a per-user statistics DataFrame from the raw chat data.

        Returns a DataFrame with one row per unique user and columns:
        User, Total_msg (messages excluding media), Total_media,
        Total_link, Emojies (list of emoji characters used) and
        Total_emojies.
        """
        raw_data = self._get_raw_data()

        USER = []
        TOTAL_MSG = []
        TOTAL_MEDIA = []
        TOTAL_LINK = []
        EMOJIES = []
        EMOJIES_LEN = []

        # Map each unique user to the slice of raw_data holding their rows.
        USER_PROFILES = {
            user: raw_data[raw_data.User == user]
            for user in raw_data.User.unique()
        }

        # Single pass per user. The original ran two identical loops over
        # the same keys and built an unused NAME list; both removed.
        for user, profile in USER_PROFILES.items():
            total_msg_with_media = profile.shape[0]
            media = list(profile.loc[
                profile.Msg.str.contains('<Media omitted>'), 'Msg'].index)
            link = list(profile.loc[
                profile.Msg.str.contains('https'), 'Msg'].index)
            # NOTE(review): demoji scans str(profile.Msg) — the Series repr,
            # which pandas truncates for long Series; kept for parity with
            # the original behaviour.
            emojies = list(demoji.findall(str(profile.Msg)).keys())

            USER.append(user)
            TOTAL_MSG.append(total_msg_with_media - len(media))
            TOTAL_MEDIA.append(len(media))
            TOTAL_LINK.append(len(link))
            EMOJIES.append(emojies)
            EMOJIES_LEN.append(len(emojies))

        Stat_data = {
            "User": USER,
            "Total_msg": TOTAL_MSG,
            "Total_media": TOTAL_MEDIA,
            "Total_link": TOTAL_LINK,
            "Emojies": EMOJIES,
            "Total_emojies": EMOJIES_LEN
        }

        return pd.DataFrame(Stat_data)
def rep_emojis(t):
    """Replace every emoji found in *t* with the placeholder ' xxemoji '."""
    # Iterate the keys only — the descriptions were bound but never used.
    # (An empty findall result makes the loop a no-op, so no guard needed.)
    for emoji_char in demoji.findall(t):
        t = t.replace(emoji_char, ' xxemoji ')
    return t
def preProcessing(text):
    """Lowercase, strip stopwords, map emoji tokens to their descriptions,
    normalise "n't" to "not", lemmatize, strip accents and punctuation.

    Returns the cleaned text. Note: the result keeps a single leading
    space, matching the original concatenation behaviour.
    """
    text = text.lower()

    lemmatizer = WordNetLemmatizer()

    # Drop English stopwords.
    words = text.split(' ')
    stopSet = stopwords.words('english')
    text = " ".join([i for i in words if i not in stopSet])

    # Replace each emoji-bearing token with the description of the first
    # emoji demoji finds in it (same single-emoji behaviour as before).
    words = text.split(' ')
    for idx, word in enumerate(words):
        emojiDict = demoji.findall(word)
        if emojiDict:
            words[idx] = next(iter(emojiDict.values()))

    # "n't" -> "not", then lemmatize. join() replaces the original
    # quadratic '+=' loop; the leading space per word is preserved.
    text = ''.join(' ' + lemmatizer.lemmatize(w.replace('n\'t', 'not'))
                   for w in words)

    # Remove non-ascii characters.
    text = unidecode(text)

    # Replace every punctuation character with a space.
    for sep in string.punctuation:
        text = text.replace(sep, ' ')

    return text
def emoji2text(connection, df_tweets):
    """ A function that replaces emoji with corresponding text inside the tweets"""

    # df_tweets is expected to carry columns ID, CONTENT, TRCONTENT and
    # FCONTENT — presumably rows of the GoldenSet table; verify with caller.
    cursor = connection.cursor()
    for i in range(len(df_tweets)):
        id = df_tweets.loc[i, 'ID']  # NOTE: shadows the builtin id()

        # Prefer the translated content when present, else the original.
        content = df_tweets.loc[i, 'TRCONTENT'] if df_tweets.loc[
            i, 'TRCONTENT'] else df_tweets.loc[i, 'CONTENT']

        #find the emojis inside the text, function returns emojis and corresponding text value
        emoDict = demoji.findall(content)
        convertedText = df_tweets.loc[i, 'FCONTENT']
        #replace all emojis with corresponding text
        if len(emoDict):
            print('ORIGINAL: ', content)
            for emo, emoText in emoDict.items():
                emoText = ' ' + emoText + ' '  #leading and padding spaces to separate emoji from other words
                convertedText = convertedText.replace(emo, emoText)
            df_tweets.loc[i, 'FCONTENT'] = convertedText
            print('CONVERTED: ', df_tweets.loc[i, 'FCONTENT'])

            # SECURITY(review): the UPDATE statement is built by string
            # concatenation; text containing '"' breaks the query and the
            # statement is SQL-injectable. Switch to a parameterized query
            # using this driver's paramstyle (unknown from here).
            query = 'UPDATE GoldenSet SET FCONTENT = "' + str(
                df_tweets.at[i, 'FCONTENT']) + '" WHERE ID = ' + str(id) + ";"
            cursor.execute(query)
            print('-' * 40 + '\n')

    cursor.close()
    print(
        "\n\n\n----------------------- EMOJI TO TEXT FINISHED -------------------------\n\n\n"
    )
# Example 6
def demojify(input_text):
    """Replace text emoticons and unicode emojis in *input_text* with words.

    Text emoticon keys are hand-written regex patterns; unicode emojis come
    from demoji.findall.
    """
    text_emojis = {
        ':-*\)': 'smile',
        ':-*]': 'smile',
        ':-*d': 'smile',
        ':-*\(': 'frown',
        ':-*\[': 'frown',
        ':-*/': 'unsure',
        ':-*o': 'astonish',
        ':-*0': 'astonish',
        'xd': 'laugh',
        ';-*\)': 'wink',
        ":'\(": 'cry',
        ':3': 'smile',
        '&lt;3': 'love',
    }
    # Find all icon emojis
    icon_emojis = demoji.findall(input_text)
    # BUG FIX: raw emoji characters can contain regex metacharacters (e.g.
    # the keycap '*' emoji), which broke or corrupted the re.sub below —
    # escape them before using them as patterns.
    emojis = {**text_emojis,
              **{re.escape(e): t for e, t in icon_emojis.items()}}

    for emoji, emoji_text in emojis.items():
        # Add extra space to avoid combining the text with the next word.
        # Extra space is removed later.
        input_text = re.sub(emoji, f' {emoji_text} ', input_text)

    return input_text
# Example 7
def df_punct(df, emoji='on'):
    """Clean a tweet DataFrame and return it.

    df: DataFrame with a 'text' column.
    emoji: 'on' to replace emojis with their word descriptions,
        'off' to leave them in place.
    Adds an 'rt' dummy column (1 = retweet) and strips URLs, 'rt'
    markers and punctuation from 'text'.
    """

    # Import necessary libraries
    import re
    import demoji
    # demoji.download_codes()

    # Reset index
    df = df.reset_index(drop=True)

    # Delete all urls from the strings, which are almost solely used to retweet
    df['text'] = [re.sub(r'http\S+', "", txt) for txt in df['text']]

    # Locate retweets and assign a dummy variable to them
    df['rt'] = [1 if 'RT @' in txt else 0 for txt in df['text']]

    # Replace all emojis with word representations
    if emoji == 'on':
        big_str = ' '.join(df['text'])
        emj_dct = demoji.findall(big_str)
        # BUG FIX: the loop variable was named 'emoji', clobbering the
        # mode parameter above; renamed to avoid the shadowing.
        for emj in emj_dct:
            df['text'] = df['text'].str.replace(emj, emj_dct[emj])
    elif emoji == 'off':
        pass

    # Delete from the text strings 'rt' which indicates a Retweet
    # (note: removes the substring 'rt' anywhere, e.g. inside 'start')
    df['text'] = [re.sub(r'rt', "", txt) for txt in df['text']]

    # Delete punctuation
    df['text'] = [
        re.sub(r'[^\w\s]', '',
               str(txt).lower().strip()) for txt in df['text']
    ]

    return df
# Example 8
def text_process(tweet):
    """Clean one tweet: strip mentions, URLs and digits, normalise common
    negative contractions to "not", expand emojis to their meanings, drop
    punctuation and stopwords, segment fused words, and return the stemmed
    tokens as a list."""
    # Strip @mentions and URL links.
    tweet = re.sub(r'@[A-Za-z0-9]+','' ,tweet, flags=re.MULTILINE)
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Drop digits.
    tweet = ''.join(ch for ch in tweet if not ch.isdigit())
    # Normalise negative contractions (lowercasing along the way, as the
    # original did on every substitution).
    for contraction in (r"\bdidn't\b", r"\bdoesn't\b", r"\bdon't\b"):
        tweet = re.sub(contraction, "not", tweet.lower())

    # Swap each emoji for its textual meaning.
    for emoji_char, meaning in demoji.findall(tweet).items():
        tweet = tweet.replace(emoji_char, meaning)

    # Remove punctuation characters, then split fused words.
    nopunc = ''.join(ch for ch in tweet if ch not in string.punctuation)
    nopunc = ' '.join(segment(nopunc))

    # Keep non-stopword tokens that contain at least one letter, stemmed.
    tokens = [word for word in nopunc.split() if word.lower() not in sw]
    kept = [tok for tok in tokens if re.search('[a-zA-Z]', tok)]
    return [stemmer.stem(tok) for tok in kept]
# Example 9
def _process_unicode_emojis(m, emoji_class):
    """Render a regex emoji match as a twemoji <img> tag.

    m: re.Match whose group() is the matched emoji sequence.
    emoji_class: CSS class name placed on the generated <img>.
    """
    e = m.group()
    # Prefer the project's UNICODE_LIST name for the emoji; fall back to
    # demoji's description. demoji.findall(e)[e] raises KeyError when
    # demoji does not recognise the sequence — presumably the caller's
    # pattern only matches known emojis; TODO confirm.
    title = UNICODE_LIST.get(e, demoji.findall(e)[e])
    if '\u200d' not in e:  # If there isn't a zero width joiner, strip out variation selectors
        e = re.sub(r'[\U0000FE00-\U0000FE0F]$', '', e)
    # Build the twemoji filename codepoints, e.g. "1f600" or
    # "1f1fa-1f1f8"; lstrip('0') drops leading zeros of the first
    # codepoint only.
    codepoint = "-".join(['%04x' % ord(_c) for _c in e]).lstrip('0')
    return f'<img class="{emoji_class}" title=":{title}:" ' \
        f'src="https://twemoji.maxcdn.com/2/svg/{codepoint}.svg" alt="{e}">'
# Example 10
def find_emojis(tweet_list):
    """Print the emoji mapping of each tweet containing emojis and report
    how many tweets had at least one."""
    emoji_count = 0
    for tweet in tweet_list:
        found = demoji.findall(tweet)
        if found:
            print(found)
            emoji_count += 1

    print(f"Found {emoji_count} tweets with emojis.")
def get_stats_frame(raw_data):
    """Build a per-user statistics DataFrame from a chat export.

    raw_data: DataFrame with at least 'User' and 'Msg' columns.
    Returns a DataFrame with columns User, Total_msg (messages minus
    media), Total_media, Total_link, Emojies and Total_emojies.
    """
    USER = []
    TOTAL_MSG = []
    TOTAL_MEDIA = []
    TOTAL_LINK = []
    EMOJIES = []
    EMOJIES_LEN = []

    # Slice the frame once per unique user.
    users = {
        user: raw_data[raw_data.User == user]
        for user in raw_data.User.unique()
    }

    # Single pass per user. The original ran two identical loops over the
    # same keys and built an unused NAME list; both removed.
    for user, profile in users.items():
        total_msg_with_media = profile.shape[0]
        media = list(
            profile.loc[profile.Msg.str.contains('<Media omitted>'),
                        'Msg'].index)
        link = list(profile.loc[profile.Msg.str.contains('https'),
                                'Msg'].index)
        # NOTE(review): demoji scans str(profile.Msg) — the Series repr,
        # which pandas truncates for long Series; kept for parity.
        emojies = list(demoji.findall(str(profile.Msg)).keys())

        USER.append(user)
        TOTAL_MSG.append(total_msg_with_media - len(media))
        TOTAL_MEDIA.append(len(media))
        TOTAL_LINK.append(len(link))
        EMOJIES.append(emojies)
        EMOJIES_LEN.append(len(emojies))

    Stat_data = {
        "User": USER,
        "Total_msg": TOTAL_MSG,
        "Total_media": TOTAL_MEDIA,
        "Total_link": TOTAL_LINK,
        "Emojies": EMOJIES,
        "Total_emojies": EMOJIES_LEN
    }

    return pd.DataFrame(Stat_data)
# Example 12
def preprocess_cleaning(df):
    '''
    Convert non-ascii to ascii
    Count URL, emoji, punc, hashtag, mentions
    convert emoji to text
    convert hashtag using camel case
    lower text
    '''

    stop_words = stopwords.words('english')
    # Map every emoji appearing anywhere in the corpus to its description.
    EMOJI_TO_TEXT = demoji.findall((' ').join(df['tweet_text'].to_list()))
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

    def lemmatize_words(text):
        # POS-tag first so the lemmatizer gets the right word class
        # (defaults to NOUN for unmapped tags).
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

    def clean_text(text):
        '''Make text lowercase, remove text in square brackets, remove links, remove user mention,
        remove punctuation, remove numbers and remove words containing numbers.'''

        text = re.sub('(#[A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', text))  # Split by camel case
        text = text.lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('@\w+', '', text) # mentions
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punc
        text = re.sub('\n', '', text)
        text = re.sub(r'(.)\1+', r'\1\1', text) # char repeated more than twice. ex hellllp -> hellp

        return text

    def emoji_to_text(text):
        # Whole-token replacement only: an emoji glued to a word won't match.
        return ' '.join([EMOJI_TO_TEXT.get(i, i) for i in text.split(' ')])

    df['num_url']=df['tweet_text'].apply(lambda x:x.count('URL'))
    df['num_user_id']=df['tweet_text'].apply(lambda x:x.count('USERID'))
    # NOTE(review): counts single characters present in EMOJI_TO_TEXT, so
    # multi-codepoint emojis are under/over-counted — confirm acceptable.
    df['num_emoji'] = df['tweet_text'].apply(lambda x:len([i for i in x if i in EMOJI_TO_TEXT]))

    df['tweet_text']=df['tweet_text'].apply(lambda x:emoji_to_text(x))
    df['tweet_text']=df['tweet_text'].apply(lambda x:unidecode(x))
    df['tweet_text']=df['tweet_text'].apply(lambda x:lemmatize_words(x))

    df['has_url']=(df['num_url']>0).astype(int)
    df['has_emoji']=(df['num_emoji']>0).astype(int)
    df['num_hashtags'] = df['tweet_text'].str.findall(r'#(\w+)').apply(lambda x : len(x))
    df['num_user_mention'] = df['tweet_text'].str.findall(r'@(\w+)').apply(lambda x : len(x))
    df['num_punctuation'] = df['tweet_text'].str.replace(r'[\w\s#]+', '').apply(lambda x : len(x))


    df['text_cleaned'] = df['tweet_text'].apply(clean_text)
    # Remove stop words
    df['text_cleaned'] = df['text_cleaned'].str.split().apply(lambda x: [word for word in x if word not in stop_words]).apply(lambda x: ' '.join(x))

    return df
# Example 13
def contains_emoji(text):
    """Return True when *text* contains at least one emoji, else False."""
    return len(demoji.findall(text)) > 0
def preprocessing(i, file, userList):
    """Parse one raw tweet JSON line, enrich it, and append it to *file*.

    i: one JSON-encoded tweet (str).
    file: open writable file object; one enriched JSON object per line.
    userList: screen names treated as persons of interest for replies.
    Relies on module-level india_list / usa_list / brazil_list for the
    country tag — TODO confirm their contents.
    """
    tweet_dic = json.loads(i)
    #date
    # Truncate created_at to the hour, then shift by +1 hour (presumably a
    # timezone adjustment — confirm).
    date = time.strftime(
        '%Y-%m-%dT%H:00:00Z',
        time.strptime(tweet_dic["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    format_str = '%Y-%m-%dT%H:00:00Z'
    dt = datetime.strptime(date, format_str)
    final = dt + timedelta(hours=1)
    #reply_text, poi_id, poi_name
    tweet_dic['tweet_date'] = final.strftime(format_str)
    # Normalise 'text' to 'full_text'; skip tweets with neither field.
    if ('full_text' in tweet_dic.keys()):
        print("tweet has full_text")
    elif ('text' in tweet_dic.keys()):
        print("has text instead of full_text")
        tweet_dic['full_text'] = tweet_dic['text']
    else:
        print("No full_text or text")
        return
    # Attribute the tweet to a person of interest: for replies, the POI is
    # the replied-to user when that user is in userList, else the author.
    if tweet_dic['in_reply_to_status_id'] is not None:
        if tweet_dic['in_reply_to_screen_name'] not in userList:
            tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
            tweet_dic['poi_id'] = tweet_dic['user']['id']
        else:
            tweet_dic['poi_name'] = tweet_dic['in_reply_to_screen_name']
            tweet_dic['poi_id'] = tweet_dic['in_reply_to_user_id']
        tweet_dic['reply_text'] = tweet_dic['full_text']
    else:
        tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
        tweet_dic['poi_id'] = tweet_dic['user']['id']
        tweet_dic['reply_text'] = None
    print(tweet_dic['poi_name'])
    #country
    # Default is "India" even when the POI is not found in any list.
    screen_name = tweet_dic['poi_name']
    tweet_dic['country'] = "India"
    if screen_name in india_list:
        tweet_dic['country'] = "India"
    elif screen_name in usa_list:
        tweet_dic['country'] = 'USA'
    elif screen_name in brazil_list:
        tweet_dic['country'] = 'brazil'
    else:
        print("error poi {}".format(tweet_dic['id']))
    #text_xx
    #text_xx = "text_" + str(tweet_dic['lang'])
    # text_copy = text with emojis removed; tweet_emotions = emojis found.
    full_text = tweet_dic['full_text']
    tweet_dic['text_copy'] = demoji.replace(full_text)
    tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
    json.dump(tweet_dic, file, ensure_ascii=False)
    file.write("\n")
def remove_emoji(transient_tweet_text):
    """Replace each emoji token of a tweet with its textual description.

    Returns the rebuilt tweet. Note: it keeps a single leading space,
    matching the original token-concatenation behaviour.
    """
    tweet_tokenizer = TweetTokenizer()
    tokenized_tweet = tweet_tokenizer.tokenize(transient_tweet_text)
    emojis_present = demoji.findall(transient_tweet_text)
    # dict.get maps emoji tokens to their description and leaves other
    # tokens alone; join() replaces the original quadratic '+=' loop.
    parts = [emojis_present.get(token, token) for token in tokenized_tweet]
    return ''.join(' ' + part for part in parts)
# Example 16
def rep_emojis(message):
    """Return *message* with every emoji character replaced by a space
    followed by its demoji description."""
    descriptions = demoji.findall(message)
    out = []
    for ch in message:
        if ch in descriptions:
            # The original's ch.upper().replace(ch, descriptions[ch]) was a
            # needless chain: for an emoji, upper() is the identity, so the
            # expression always reduced to the description itself.
            out.append(' ')
            out.append(descriptions[ch])
        else:
            out.append(ch)
    return ''.join(out)
# Example 17
def emojiReplace_v2(text_string):
    """Swap each emoji in *text_string* for an 'x'-joined word token and
    collapse runs of the same token into a 'mult'-prefixed token."""
    found = demoji.findall(text_string)
    for emo, description in found.items():
        # Join the description words with a plain letter 'x' because BERT's
        # tokenizer splits on special tokens like '%' and '$'.
        token = 'x'.join(re.split('\W+', description)) + ' '
        text_string = text_string.replace(emo, token)

        # Control for two-or-more identical tokens in a row.
        repeated = '(' + token + ')' + '{2,}'
        text_string = re.sub(repeated, 'mult' + token + ' ', text_string)
    return text_string
def parse_emoji(sentence: str, sub: bool):
    """Either annotate each emoji as ' EMOJI:[description] ' (sub=True) or
    merely pad it with spaces (sub=False); then squeeze repeated spaces."""
    for emoji in demoji.findall(sentence):
        if not sub:
            sentence = sentence.replace(emoji, f" {emoji} ")
        else:
            desc = emj.demojize(emoji)
            desc = re.sub(r"[:_]", " ", desc).strip()
            sentence = sentence.replace(emoji, f" EMOJI:[{desc}] ")

    return re.sub(r"\s{2,}", " ", sentence)
def cleanwords(sentence):
    """Lemmatise and lowercase the words of *sentence*, dropping stopwords
    and digits; emojis are stripped via deEmojify.

    NOTE(review): sentence_emogis_short collects emoji descriptions but is
    never used — dead code, possibly an unfinished feature.
    NOTE(review): pos_tag(word) tags the *characters* of the word (pos_tag
    expects a token list); pos_tag([word]) was probably intended — confirm
    before changing.
    """
    sentence_emogis = demoji.findall(sentence)
    sentence_emogis_short = " "
    for value in sentence_emogis.values():
        sentence_emogis_short = sentence_emogis_short + (str(value) + " ")
    sentence = deEmojify(sentence)
    words = word_tokenize(sentence)
    words = [
        lemmatizer.lemmatize(word,
                             pos=get_simple_POS(pos_tag(word)[0][1])).lower()
        for word in words if not word.lower() in stop and not word.isdigit()
    ]
    return " ".join(words)
	def replace_emojis(self):
		"""Replace each emoji token in self.text with its description.

		Returns (self.text, False) when no emojis are present, otherwise
		(cleaned_text, list_of_emoji_characters_found).
		"""
		emojis = demoji.findall(self.text)
		if not emojis:
			return self.text, False

		# BUG FIX: the original tokenized the undefined global name 'text';
		# it must tokenize this instance's text.
		tokenized_text = tokenizer.tokenize(self.text)
		for i, token in enumerate(tokenized_text):
			if token in emojis:
				tokenized_text[i] = emojis[token]
		self.text = ' '.join(tokenized_text)
		self.text = self.erase_emojis()
		return self.text, list(emojis.keys())
# Example 21
def preprocessed(textData):
    """Read a CSV of texts, clean each row (lowercase, stopwords, URLs,
    punctuation, lemmatisation, numbers-to-words, emoji-to-text) and append
    every cleaned row to 'lastFinishedPreprocess.csv'.
    """
    allfile = 'lastFinishedPreprocess.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for text in files:
            # np.char.lower operates on the list of row fields.
            newFile = np.char.lower(text)

            newFile = remove_stopwords(str(newFile))

            # Strip URLs.
            newFile = re.sub('http[s]?://\S+', '', str(newFile))

            newFile = np.char.replace(newFile, '\\n', '')
            newFile = np.char.replace(newFile, '\\r', '')

            # Replace symbols with spaces (two passes over two symbol
            # sets), then drop commas and squeeze double spaces.
            symbols = "!#$%&()*+-"
            for i in symbols:
                newFile = np.char.replace(newFile, i, ' ')
            symbols = "!./:;<=>?@[\]^_`{|}~"
            for i in range(len(symbols)):
                newFile = np.char.replace(newFile, symbols[i], ' ')
            newFile = np.char.replace(newFile, ',', '')
            newFile = np.char.replace(newFile, "  ", " ")
            newFile = np.char.replace(str(newFile), "’", "")

            # Drop single-letter words.
            newFile = re.sub(r"\b[a-zA-Z]\b", "", str(newFile))
            '''ps = PorterStemmer()
            newFile = [ps.stem(word) for word in newFile]'''

            lemmatizer = WordNetLemmatizer()
            newFile = word_tokenize(newFile)
            newFile = [lemmatizer.lemmatize(word) for word in newFile]

            # Spell out digit-only tokens as words.
            newFile = ' '.join([
                num2words.num2words(i) if i.isdigit() else i for i in newFile
            ])

            # NOTE(review): iterates single characters, so only
            # single-codepoint emojis are replaced; multi-codepoint
            # sequences (flags, ZWJ emojis) slip through.
            for i in newFile:
                emojis = demoji.findall(i)
                if i in emojis:
                    newFile = newFile.replace(i, emojis[i])
            # No-op re-join of the same characters; kept as-is.
            newFile = ''.join(i for i in newFile)

            # Output file is reopened per row — inefficient but kept as-is.
            with open(allfile, 'a', newline='') as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow([newFile])
def replace_emojis(features):
    """Return a copy of *features* with every emoji wrapped as '<description>'.

    Each feature string has its emojis replaced by '<text>', where text is
    demoji's description; strings without emojis are kept unchanged.
    """
    features_clean = []
    for feature in features:
        # An empty findall makes the loop a no-op, so the original's
        # duplicated append branches collapse into one.
        for emoji_char, description in demoji.findall(feature).items():
            feature = feature.replace(emoji_char, f"<{description}>")
        features_clean.append(feature)
    return features_clean
def preprocess(text):
    """Tokenize a tweet and normalise it: URLs -> '[URL]', mentions ->
    '[USER]', emojis -> ':descriptionwithnospaces:', other tokens
    lowercased. Returns the space-joined result."""
    tokenizer = TweetTokenizer()
    emojis = demoji.findall(text)

    def normalise(token):
        # Order matters: the URL check wins over mention/emoji checks.
        if 'http' in token:
            return '[URL]'
        if '@' in token:
            return '[USER]'
        if token in emojis:
            return ':' + ''.join(emojis[token].split()) + ':'
        return token.lower()

    return ' '.join(normalise(t) for t in tokenizer.tokenize(text))
# Example 24
    async def poll(self, ctx, *, arg):
        """Create a reaction poll: echo *arg* in the channel and add every
        emoji found in it (unicode via demoji, server-custom via
        customEmojiPattern) as a reaction on the sent message.
        """
        emoji = []

        # Legacy-syntax guard: votes= is no longer required.
        if 'votes=' in arg:
            return await send_message(
                ctx,
                '''You don't need to do votes= for emoji anymore, I'll pull them automatically.''',
                error=True)

        # Unicode emojis from demoji plus custom emojis (<:name:id>).
        emoji = list(demoji.findall(arg).keys()) + list(
            re.findall(customEmojiPattern, arg, flags=re.DOTALL))
        msg = await ctx.send(f"**Poll time! <@{ctx.author.id}> asks:**\n{arg}")
        for reaction in emoji:
            # strip('<> ') normalises custom-emoji markup before reacting.
            await msg.add_reaction(reaction.strip('<> '))
# Example 25
def emoji(textData): #for some reason us flag not being converted
    """Read a CSV, replace single-character emojis in each row with their
    descriptions, and append cleaned rows to 'tempEmo.csv'.

    NOTE(review): the char-by-char scan only catches single-codepoint
    emojis; multi-codepoint ones (e.g. the US flag, two regional
    indicators) are missed — that is why the flag is "not converted".
    """
    noEmo = 'tempEmo.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile,delimiter=' ', quotechar='|')
        for text in files:
            for row in text:
                for i in row:
                    emojis = demoji.findall(i)
                    if i in emojis:
                        row = row.replace(i,emojis[i])
                # Collapse repeated whitespace.
                row = ' '.join(i for i in row.split())
                # Output file reopened per row — inefficient but kept as-is.
                with open(noEmo,'a',newline='') as csvfile:
                    csvwriter = csv.writer(csvfile)
                    csvwriter.writerow([row])
# Example 26
def clean_sentence(sentence):
    """Strip hashtag markers, replace links with 'URL', then substitute
    emojis and emoticons with their textual meanings."""
    if re.search(hashtag, sentence) is not None:
        sentence = re.sub(hashtag, r'\1', sentence)
    sentence = re.sub(links, 'URL', sentence)

    emoji_map = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]

    # Emojis first (trailing space keeps words apart), then emoticons.
    for emoji_char, meaning in emoji_map.items():
        sentence = sentence.replace(emoji_char, meaning+" ")
    if emoticons['flag']:
        for value, mean in zip(emoticons['value'], emoticons['mean']):
            sentence = sentence.replace(value, extract_emotion(mean))
    return sentence
def get_emoji_list(string):
    """Return the list of emoji characters demoji finds in *string*.

    BUG FIX: the original compared the result against the undefined name
    'null' (a guaranteed NameError at runtime) and could also return an
    unbound emo_list. demoji.findall always returns a dict, so the result
    is simply its keys (possibly an empty list). The unused pre-compiled
    emoji regex was removed.
    """
    emoticons_dict = demoji.findall(string)
    return list(emoticons_dict.keys())
# Example 28
def clean_tweets(tweet, translate=False, tokenize=False):
    '''process tweets:
    - tweet processor to get rid of url links
    - remove punctuations
    - translate to english
    - to lowercase
    - remove common stopwords
    - stem instead of lemmitize (for faster process, although draw back is the stemmed words sometime do not look like a real word)
    - process emojis i.g. :) --> smily face'''
    tweet = str(tweet)  # sometimes not a string type i.e. float
    orig_tweet = copy.deepcopy(tweet)
    # remove urls,  mentions, and hashtags
    try:
        tweet = p.clean(tweet)
    except Exception as e:
        print(e)
    # 1. remove non-letter or space.
    # NOTE(review): the pattern '[^[a-z|A-Z|\s]*' puts '[' and '|' inside
    # the character class, so it does not do what this comment says —
    # the intended pattern was likely r'[^a-zA-Z\s]*'; confirm before fixing.
    tweet = re.sub('[^[a-z|A-Z|\s]*', '', tweet)
    if translate:
        try:
            # NOTE(review): 'translate' is the boolean parameter, shadowing
            # whatever translation function was intended — calling it when
            # translate=True raises TypeError. Needs a rename upstream;
            # renaming the parameter here would break keyword callers.
            tweet = translate(tweet)
        except json.decoder.JSONDecodeError:
            print(tweet)

    stop = stopwords.words('english')  # common stop words
    stop_words = set(stop)
    # 2. convert to lower case and tokenize
    tweet = tweet.lower()
    tweet_tokens = word_tokenize(tweet)
    # 3. remove stopwords, Stemming
    ret = []
    for word in tweet_tokens:
        if not word in stop_words:
            stemmer = PorterStemmer()
            word = stemmer.stem(word)
            ret.append(word)
    # 4. append emoji descriptions extracted from the *original* tweet.
    ret.extend(list(demoji.findall(orig_tweet).values()))
    if not tokenize:
        ret = ' '.join(ret)
    return ret
# Example 29
def clean_sentence(sentence):
    """
    replaces all emojis and emoticons with their text equivalent
    :param sentence: str, raw text
    :return: clean text
    """
    emoji_map = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]

    # Swap emojis first (trailing space keeps words apart) ...
    for emoji_char, meaning in emoji_map.items():
        sentence = sentence.replace(emoji_char, meaning+" ")
    # ... then emoticons, via the project's extract_emotion helper.
    if emoticons['flag']:
        for value, mean in zip(emoticons['value'], emoticons['mean']):
            sentence = sentence.replace(value, extract_emotion(mean))
    return sentence
# Example 30
def emoji(cleaned_text, Emoji_emo):
    """Score the emojis of *cleaned_text* against six emotion categories.

    Emoji_emo: DataFrame whose columns Happy / Excitement / Pleasant /
        Surprise / Fear / Angry hold emoji descriptions.
    For every emoji found in the text, each category whose column contains
    its description gets +1. Returns a 1-row integer DataFrame of scores.
    """
    found = demoji.findall(cleaned_text)
    categories = ['Happy', 'Excitement', 'Pleasant', 'Surprise',
                  'Fear', 'Angry']
    score = pd.DataFrame(np.zeros(shape=(1, 6)).astype(int),
                         columns=categories)
    # Loop over categories instead of the original's six copy-pasted
    # if-blocks — same checks, same increments.
    for emo in found:
        description = found[emo]
        for category in categories:
            if Emoji_emo[category].str.contains(description).sum() > 0:
                score[category] += 1
    return (score)