def emoji_to_text(text):
    # Map every emoji in the text to its description and dump them to CSV.
    # (The original assigned demoji.findall twice and then read an undefined
    # name, demoji_text; fixed to a single lookup.)
    demoji_text = demoji.findall(text)
    temp_list = demoji_text.values()
    no_emoji_df = pd.DataFrame(temp_list, columns=['emoji_Text'])
    print(no_emoji_df)
    no_emoji_df.to_csv(
        r'C:\Users\heromero\Desktop\Stefanini\Desarollo_Redes_sociales\Nueva carpeta\IGphotoDownloader\emoji_to_text.csv'
    )
def get_stats_frame(self):
    raw_data = self._get_raw_data()
    USER = []
    TOTAL_MSG = []
    TOTAL_MEDIA = []
    TOTAL_LINK = []
    # USER_PROFILES maps each unique user to a DataFrame holding only that
    # user's messages, so per-user statistics can be computed independently.
    USER_PROFILES = {
        user: raw_data[raw_data.User == user]
        for user in raw_data.User.unique()
    }
    for user in USER_PROFILES.keys():
        total_msg_with_media = USER_PROFILES[user].shape[0]
        media = list(USER_PROFILES[user].loc[
            USER_PROFILES[user].Msg.str.contains('<Media omitted>'),
            'Msg'].index)
        link = list(USER_PROFILES[user].loc[
            USER_PROFILES[user].Msg.str.contains('https'), 'Msg'].index)
        USER.append(user)
        TOTAL_MSG.append(total_msg_with_media - len(media))
        TOTAL_MEDIA.append(len(media))
        TOTAL_LINK.append(len(link))
    # Collect the emojis each user sent. Join the full message column rather
    # than str(Series), which would only scan pandas' truncated repr.
    NAME = []
    EMOJIES = []
    EMOJIES_LEN = []
    for user in USER_PROFILES.keys():
        NAME.append(user)
        emojis = list(demoji.findall(
            ' '.join(USER_PROFILES[user].Msg.astype(str))).keys())
        EMOJIES.append(emojis)
        EMOJIES_LEN.append(len(emojis))
    Stat_data = {
        "User": USER,
        "Total_msg": TOTAL_MSG,
        "Total_media": TOTAL_MEDIA,
        "Total_link": TOTAL_LINK,
        "Emojies": EMOJIES,
        "Total_emojies": EMOJIES_LEN
    }
    Stat_data_frame = pd.DataFrame(Stat_data)
    return Stat_data_frame
def rep_emojis(t):
    d = demoji.findall(t)
    if d:
        for k, v in d.items():
            t = t.replace(k, ' xxemoji ')
            # t = t.replace(k, ' xxemoji xx' + v.replace(' ', '_') + ' ')
    return t
def preProcessing(text):
    text = text.lower()
    lemmatizer = WordNetLemmatizer()
    words = text.split(' ')
    stopSet = stopwords.words('english')
    text = " ".join([i for i in words if i not in stopSet])
    # Replace each emoji-bearing token with the description of its first emoji.
    words = text.split(' ')
    idx = 0
    for word in words:
        emojiDict = demoji.findall(word)
        emojiText = list(emojiDict.keys())
        if len(emojiText) > 0:
            words[idx] = emojiDict[emojiText[0]]
        idx += 1
    text = ''
    for w in words:
        nWord = w.replace("n't", 'not')
        text += ' ' + lemmatizer.lemmatize(nWord)
    # Remove non-ASCII characters, then break on punctuation.
    text = unidecode(text)
    seperators = list(string.punctuation)
    for i in seperators:
        text = text.replace(i, ' ')
    return text
def emoji2text(connection, df_tweets):
    """Replace each emoji inside the tweets with its corresponding text."""
    cursor = connection.cursor()
    for i in range(len(df_tweets)):
        id = df_tweets.loc[i, 'ID']
        content = df_tweets.loc[i, 'TRCONTENT'] if df_tweets.loc[
            i, 'TRCONTENT'] else df_tweets.loc[i, 'CONTENT']
        # Find the emojis inside the text; demoji returns a dict mapping
        # each emoji to its text description.
        emoDict = demoji.findall(content)
        convertedText = df_tweets.loc[i, 'FCONTENT']
        # Replace all emojis with their corresponding text.
        if len(emoDict):
            print('ORIGINAL: ', content)
            for emo, emoText in emoDict.items():
                # Leading and padding spaces separate the emoji text from
                # the surrounding words.
                emoText = ' ' + emoText + ' '
                convertedText = convertedText.replace(emo, emoText)
            df_tweets.loc[i, 'FCONTENT'] = convertedText
            print('CONVERTED: ', df_tweets.loc[i, 'FCONTENT'])
            # Parameterized query instead of string concatenation, which broke
            # on quotes inside the text and invited SQL injection (adjust the
            # placeholder style to your DB driver's paramstyle).
            query = 'UPDATE GoldenSet SET FCONTENT = ? WHERE ID = ?;'
            cursor.execute(query, (str(df_tweets.at[i, 'FCONTENT']), id))
            print('-' * 40 + '\n')
    cursor.close()
    print("\n\n\n----------------------- EMOJI TO TEXT FINISHED -------------------------\n\n\n")
def demojify(input_text):
    # Regex patterns for classic text emoticons.
    text_emojis = {
        ':-*\)': 'smile',
        ':-*]': 'smile',
        ':-*d': 'smile',
        ':-*\(': 'frown',
        ':-*\[': 'frown',
        ':-*/': 'unsure',
        ':-*o': 'astonish',
        ':-*0': 'astonish',
        'xd': 'laugh',
        ';-*\)': 'wink',
        ":'\(": 'cry',
        ':3': 'smile',
        '<3': 'love',
    }
    # Find all icon emojis. Escape them, since re.sub treats dict keys as
    # regex patterns and a few emojis (e.g. the keycap '*') are metacharacters.
    icon_emojis = {re.escape(k): v for k, v in demoji.findall(input_text).items()}
    emojis = {**text_emojis, **icon_emojis}
    for emoji, emoji_text in emojis.items():
        # Add extra space to avoid combining the text with the next word.
        # Extra space is removed later.
        input_text = re.sub(emoji, f' {emoji_text} ', input_text)
    return input_text
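# A rough smoke test for demojify; a sketch only, since the exact demoji
# descriptions depend on the installed emoji database (older demoji versions
# need demoji.download_codes() first).
print(demojify("well played xd <3"))
# -> roughly: "well played  laugh   love "  (extra spaces are removed later)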
def df_punct(df, emoji='on'):
    # Import necessary libraries
    import re
    import demoji
    # demoji.download_codes()

    # Reset index
    df = df.reset_index(drop=True)
    # Delete all urls from the strings, which are almost solely used to retweet
    df['text'] = [re.sub(r'http\S+', "", txt) for txt in df['text']]
    # Locate retweets and assign a dummy variable to them
    df['rt'] = [1 if 'RT @' in txt else 0 for txt in df['text']]
    # Replace all emojis with word representations
    if emoji == 'on':
        big_str = ' '.join(df['text'])
        emj_dct = demoji.findall(big_str)
        for emj in emj_dct:
            # regex=False: the emoji is a literal string, not a pattern.
            df['text'] = df['text'].str.replace(emj, emj_dct[emj], regex=False)
    elif emoji == 'off':
        pass
    # Delete the 'RT' retweet marker; a word boundary keeps letters inside
    # ordinary words (e.g. 'part') intact.
    df['text'] = [re.sub(r'\bRT\b', "", txt) for txt in df['text']]
    # Delete punctuation
    df['text'] = [
        re.sub(r'[^\w\s]', '', str(txt).lower().strip()) for txt in df['text']
    ]
    return df
def text_process(tweet):
    tweet = tweet.lower()
    # Removing mentions
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet, flags=re.MULTILINE)
    # Removing url links
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Removing numbers
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    # Converting some negated contractions to 'not'
    tweet = re.sub(r"\bdidn't\b", "not", tweet)
    tweet = re.sub(r"\bdoesn't\b", "not", tweet)
    tweet = re.sub(r"\bdon't\b", "not", tweet)
    # Converting emojis to their meaning
    # demoji.download_codes()
    l = demoji.findall(tweet)
    for key, value in l.items():
        tweet = tweet.replace(key, value)
    # Removing punctuation
    nopunc = [char for char in tweet if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # Separating words (wordsegment's segment; requires load() beforehand)
    nopunc = ' '.join(segment(nopunc))
    # Returning the tweet without the stopwords (sw and stemmer are module-level)
    tokens = [word for word in nopunc.split() if word.lower() not in sw]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def _process_unicode_emojis(m, emoji_class):
    e = m.group()
    title = UNICODE_LIST.get(e, demoji.findall(e)[e])
    if '\u200d' not in e:
        # If there isn't a zero width joiner, strip out variation selectors
        e = re.sub(r'[\U0000FE00-\U0000FE0F]$', '', e)
    codepoint = "-".join(['%04x' % ord(_c) for _c in e]).lstrip('0')
    return f'<img class="{emoji_class}" title=":{title}:" ' \
           f'src="https://twemoji.maxcdn.com/2/svg/{codepoint}.svg" alt="{e}">'
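# A worked example of the codepoint construction above (a sketch, assuming
# demoji recognizes the emoji):
#   e = "🔥" (U+1F525: no ZWJ, no variation selector)
#   "-".join(['%04x' % ord(c) for c in e]).lstrip('0')  ->  "1f525"
#   => https://twemoji.maxcdn.com/2/svg/1f525.svg
# For "©️" (U+00A9 + VS16) the trailing selector is stripped and "00a9" is
# left-trimmed to "a9", matching twemoji's file naming.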
def find_emojis(tweet_list):
    emoji_count = 0
    for tweet in tweet_list:
        ems = demoji.findall(tweet)
        if len(ems):
            print(ems)
            emoji_count += 1
    print(f"Found {emoji_count} tweets with emojis.")
def get_stats_frame(raw_data):
    USER = []
    TOTAL_MSG = []
    TOTAL_MEDIA = []
    TOTAL_LINK = []
    # One sub-DataFrame per unique user.
    users = {
        user: raw_data[raw_data.User == user]
        for user in raw_data.User.unique()
    }
    for user in users.keys():
        total_msg_with_media = users[user].shape[0]
        media = list(
            users[user].loc[users[user].Msg.str.contains('<Media omitted>'),
                            'Msg'].index)
        link = list(users[user].loc[users[user].Msg.str.contains('https'),
                                    'Msg'].index)
        USER.append(user)
        TOTAL_MSG.append(total_msg_with_media - len(media))
        TOTAL_MEDIA.append(len(media))
        TOTAL_LINK.append(len(link))
    # Collect the emojis each user sent; join the message column instead of
    # str(Series), which would only scan pandas' truncated repr.
    NAME = []
    EMOJIES = []
    EMOJIES_LEN = []
    for user in users.keys():
        NAME.append(user)
        emojis = list(demoji.findall(' '.join(users[user].Msg.astype(str))).keys())
        EMOJIES.append(emojis)
        EMOJIES_LEN.append(len(emojis))
    Stat_data = {
        "User": USER,
        "Total_msg": TOTAL_MSG,
        "Total_media": TOTAL_MEDIA,
        "Total_link": TOTAL_LINK,
        "Emojies": EMOJIES,
        "Total_emojies": EMOJIES_LEN
    }
    Stat_data_frame = pd.DataFrame(Stat_data)
    return Stat_data_frame
def preprocess_cleaning(df):
    '''
    Convert non-ASCII to ASCII.
    Count URLs, emojis, punctuation, hashtags, mentions.
    Convert emojis to text.
    Split camel-case hashtags into words.
    Lowercase the text.
    '''
    stop_words = stopwords.words('english')
    # One findall over the whole corpus: maps every emoji that occurs
    # anywhere in the data to its description.
    EMOJI_TO_TEXT = demoji.findall(' '.join(df['tweet_text'].to_list()))
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB,
                   "J": wordnet.ADJ, "R": wordnet.ADV}

    def lemmatize_words(text):
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                         for word, pos in pos_tagged_text])

    def clean_text(text):
        '''Make text lowercase, remove text in square brackets, remove links,
        remove user mentions, remove punctuation, remove numbers and remove
        words containing numbers.'''
        text = re.sub('(#[A-Z][a-z]+)', r' \1',
                      re.sub('([A-Z]+)', r' \1', text))  # split camel case
        text = text.lower()
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub(r'@\w+', '', text)  # mentions
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
        text = re.sub('\n', '', text)
        text = re.sub(r'(.)\1+', r'\1\1', text)  # chars repeated more than twice, e.g. hellllp -> hellp
        return text

    def emoji_to_text(text):
        # Only replaces emojis that stand alone as space-delimited tokens.
        return ' '.join([EMOJI_TO_TEXT.get(i, i) for i in text.split(' ')])

    df['num_url'] = df['tweet_text'].apply(lambda x: x.count('URL'))
    df['num_user_id'] = df['tweet_text'].apply(lambda x: x.count('USERID'))
    # Character-level count: multi-codepoint emojis are counted per codepoint.
    df['num_emoji'] = df['tweet_text'].apply(lambda x: len([i for i in x if i in EMOJI_TO_TEXT]))
    df['tweet_text'] = df['tweet_text'].apply(emoji_to_text)
    df['tweet_text'] = df['tweet_text'].apply(unidecode)
    df['tweet_text'] = df['tweet_text'].apply(lemmatize_words)
    df['has_url'] = (df['num_url'] > 0).astype(int)
    df['has_emoji'] = (df['num_emoji'] > 0).astype(int)
    df['num_hashtags'] = df['tweet_text'].str.findall(r'#(\w+)').apply(len)
    df['num_user_mention'] = df['tweet_text'].str.findall(r'@(\w+)').apply(len)
    df['num_punctuation'] = df['tweet_text'].str.replace(r'[\w\s#]+', '', regex=True).apply(len)
    df['text_cleaned'] = df['tweet_text'].apply(clean_text)
    # Remove stop words
    df['text_cleaned'] = (df['text_cleaned'].str.split()
                          .apply(lambda x: [word for word in x if word not in stop_words])
                          .apply(' '.join))
    return df
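# An illustration of the emoji_to_text lookup above (hypothetical tweet;
# replacement only fires when the emoji is a standalone space-delimited token):
#   demoji.findall("good luck 🔥")  ->  {"🔥": "fire"}
#   emoji_to_text("good luck 🔥")   ->  "good luck fire"
#   emoji_to_text("good luck🔥")    ->  "good luck🔥"  (glued emoji untouched)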
def contains_emoji(text):
    """Return True if text contains an emoji, False otherwise.

    Examples:
    >>> contains_emoji("🦠 coronavirus. 😷😷😷")
    True
    >>> contains_emoji("No emojis here. :( )")
    False
    """
    return bool(demoji.findall(text))
def preprocessing(i, file, userList):
    tweet_dic = json.loads(i)
    # date: normalize created_at to the top of the hour, then shift by one hour
    date = time.strftime(
        '%Y-%m-%dT%H:00:00Z',
        time.strptime(tweet_dic["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    format_str = '%Y-%m-%dT%H:00:00Z'
    dt = datetime.strptime(date, format_str)
    final = dt + timedelta(hours=1)
    tweet_dic['tweet_date'] = final.strftime(format_str)
    # reply_text, poi_id, poi_name
    if 'full_text' in tweet_dic.keys():
        print("tweet has full_text")
    elif 'text' in tweet_dic.keys():
        print("has text instead of full_text")
        tweet_dic['full_text'] = tweet_dic['text']
    else:
        print("No full_text or text")
        return
    if tweet_dic['in_reply_to_status_id'] is not None:
        if tweet_dic['in_reply_to_screen_name'] not in userList:
            tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
            tweet_dic['poi_id'] = tweet_dic['user']['id']
        else:
            tweet_dic['poi_name'] = tweet_dic['in_reply_to_screen_name']
            tweet_dic['poi_id'] = tweet_dic['in_reply_to_user_id']
        tweet_dic['reply_text'] = tweet_dic['full_text']
    else:
        tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
        tweet_dic['poi_id'] = tweet_dic['user']['id']
        tweet_dic['reply_text'] = None
    print(tweet_dic['poi_name'])
    # country (india_list, usa_list, brazil_list are module-level)
    screen_name = tweet_dic['poi_name']
    tweet_dic['country'] = "India"
    if screen_name in india_list:
        tweet_dic['country'] = "India"
    elif screen_name in usa_list:
        tweet_dic['country'] = 'USA'
    elif screen_name in brazil_list:
        tweet_dic['country'] = 'brazil'
    else:
        print("error poi {}".format(tweet_dic['id']))
    # Strip emojis from the text copy and keep them in a separate field.
    full_text = tweet_dic['full_text']
    tweet_dic['text_copy'] = demoji.replace(full_text)
    tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
    json.dump(tweet_dic, file, ensure_ascii=False)
    file.write("\n")
def remove_emoji(transient_tweet_text):
    # Despite the name, this substitutes each emoji with its demoji
    # description rather than deleting it.
    tweet_tokenizer = TweetTokenizer()
    tokenized_tweet = tweet_tokenizer.tokenize(transient_tweet_text)
    emojis_present = demoji.findall(transient_tweet_text)
    tweet_no_emoji = ''
    for i, s in enumerate(tokenized_tweet):
        if s in emojis_present.keys():
            tweet_no_emoji = tweet_no_emoji + ' ' + emojis_present[s]
        else:
            tweet_no_emoji = tweet_no_emoji + ' ' + s
    return tweet_no_emoji
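# A quick sketch of remove_emoji's behavior (description assumed from demoji;
# note the leading space the token join produces):
remove_emoji("so cool 🔥")  # -> " so cool fire"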
def rep_emojis(message):
    butter = []
    bandit = demoji.findall(message)
    # Walk the message character by character; only single-codepoint emojis
    # can match (multi-codepoint sequences are split apart).
    no = [char for char in message]
    for word in no:
        if word in bandit.keys():
            butter.append(' ')
            # word.upper().replace(word, ...) simplified: it always reduced
            # to the description itself.
            butter.append(bandit[word])
        else:
            butter.append(word)
    return ''.join(butter)
def emojiReplace_v2(text_string):
    emoji_dict = demoji.findall(text_string)
    for emoji in emoji_dict.keys():
        # Join the words of the description with a normal letter 'x' because
        # BERT's tokenizer splits on special tokens like '%' and '$'.
        emoji_token = 'x'.join(re.split(r'\W+', emoji_dict[emoji])) + ' '
        text_string = text_string.replace(emoji, emoji_token)
        # Controlling for multiple emojis in a row
        pattern = '(' + emoji_token + ')' + '{2,}'
        text_string = re.sub(pattern, 'mult' + emoji_token + ' ', text_string)
    return text_string
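# An approximate trace of emojiReplace_v2, assuming demoji describes 🙂 as
# "slightly smiling face":
emojiReplace_v2("nice 🙂🙂🙂")
# each 🙂 -> "slightlyxsmilingxface "; the run of three then collapses to
# -> roughly "nice multslightlyxsmilingxface  "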
def parse_emoji(sentence: str, sub: bool):
    # emj is assumed to be the `emoji` package (import emoji as emj).
    emojis = demoji.findall(sentence)
    for emoji in emojis:
        if sub:
            desc = emj.demojize(emoji)
            desc = re.sub(r"[:_]", " ", desc).strip()
            sentence = sentence.replace(emoji, f" EMOJI:[{desc}] ")
        else:
            sentence = sentence.replace(emoji, f" {emoji} ")
    sentence = re.sub(r"\s{2,}", " ", sentence)
    return sentence
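# The two modes of parse_emoji, sketched (descriptions depend on the emoji
# package's data):
parse_emoji("deal 🔥", sub=True)   # -> "deal EMOJI:[fire] " (roughly)
parse_emoji("deal 🔥", sub=False)  # -> "deal 🔥 " (emoji kept, re-spaced)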
def cleanwords(sentence):
    # Collect the descriptions of any emojis in the sentence (currently unused).
    sentence_emogis = demoji.findall(sentence)
    sentence_emogis_short = " "
    for value in sentence_emogis.values():
        sentence_emogis_short = sentence_emogis_short + (str(value) + " ")
    sentence = deEmojify(sentence)
    words = word_tokenize(sentence)
    # pos_tag expects a list of tokens, so wrap each word in a list
    # (the original pos_tag(word) tagged individual characters).
    words = [
        lemmatizer.lemmatize(word, pos=get_simple_POS(pos_tag([word])[0][1])).lower()
        for word in words
        if not word.lower() in stop and not word.isdigit()
    ]
    return " ".join(words)
def replace_emojis(self):
    """Replace any emojis/emoticons found in the text with their descriptions."""
    emojis = demoji.findall(self.text)
    if not emojis:
        return self.text, False
    tokenized_text = tokenizer.tokenize(self.text)  # was tokenize(text), an undefined name
    for i, s in enumerate(tokenized_text):
        if s in emojis.keys():
            tokenized_text[i] = emojis[s]
    self.text = ' '.join(tokenized_text)
    self.text = self.erase_emojis()
    return self.text, list(emojis.keys())
def preprocessed(textData):
    allfile = 'lastFinishedPreprocess.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for text in files:
            newFile = np.char.lower(text)
            newFile = remove_stopwords(str(newFile))
            newFile = re.sub(r'http[s]?://\S+', '', str(newFile))
            newFile = np.char.replace(newFile, '\\n', '')
            newFile = np.char.replace(newFile, '\\r', '')
            symbols = "!#$%&()*+-"
            for i in symbols:
                newFile = np.char.replace(newFile, i, ' ')
            symbols = "!./:;<=>?@[\\]^_`{|}~"
            for i in range(len(symbols)):
                newFile = np.char.replace(newFile, symbols[i], ' ')
            newFile = np.char.replace(newFile, ',', '')
            newFile = np.char.replace(newFile, "  ", " ")
            newFile = np.char.replace(str(newFile), "’", "")
            newFile = re.sub(r"\b[a-zA-Z]\b", "", str(newFile))
            # ps = PorterStemmer()
            # newFile = [ps.stem(word) for word in newFile]
            lemmatizer = WordNetLemmatizer()
            newFile = word_tokenize(newFile)
            newFile = [lemmatizer.lemmatize(word) for word in newFile]
            newFile = ' '.join([
                num2words.num2words(i) if i.isdigit() else i for i in newFile
            ])
            # Character-wise pass: only single-codepoint emojis can match here.
            for i in newFile:
                emojis = demoji.findall(i)
                if i in emojis:
                    newFile = newFile.replace(i, emojis[i])
            with open(allfile, 'a', newline='') as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow([newFile])
def replace_emojis(features):
    """Replace each emoji in every feature string with its <description>."""
    features_clean = []
    for f in features:
        f_res = demoji.findall(f)
        if len(f_res) > 0:
            for x, y in f_res.items():
                f = f.replace(x, f"<{y}>")
            features_clean.append(f)
        else:
            features_clean.append(f)
    return features_clean
def preprocess(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    emojis = demoji.findall(text)
    cleaned = []
    for token in tokens:
        if 'http' in token:
            cleaned.append('[URL]')
        elif '@' in token:
            cleaned.append('[USER]')
        elif token in emojis:
            cleaned.append(':' + ''.join(emojis[token].split()) + ':')
        else:
            cleaned.append(token.lower())
    return ' '.join(cleaned)
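# A sketch of preprocess on a synthetic tweet (emoji description assumed):
preprocess("Check https://t.co/x @bob 🔥")
# -> "check [URL] [USER] :fire:"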
async def poll(self, ctx, *, arg):
    emoji = []
    if 'votes=' in arg:
        return await send_message(
            ctx,
            "You don't need to do votes= for emoji anymore, "
            "I'll pull them automatically.",
            error=True)
    # emoji = list(re.findall(emojiPattern, arg, flags=re.DOTALL)) + list(re.findall(customEmojiPattern, arg, flags=re.DOTALL))
    emoji = list(demoji.findall(arg).keys()) + list(
        re.findall(customEmojiPattern, arg, flags=re.DOTALL))
    msg = await ctx.send(f"**Poll time! <@{ctx.author.id}> asks:**\n{arg}")
    for reaction in emoji:
        await msg.add_reaction(reaction.strip('<> '))
def emoji(textData):
    # Flag emojis (e.g. the US flag) are not converted here because they are
    # multi-codepoint sequences, and this loop walks single characters.
    noEmo = 'tempEmo.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for text in files:
            for row in text:
                for i in row:
                    emojis = demoji.findall(i)
                    if i in emojis:
                        row = row.replace(i, emojis[i])
                row = ' '.join(i for i in row.split())
                with open(noEmo, 'a', newline='') as outfile:
                    csvwriter = csv.writer(outfile)
                    csvwriter.writerow([row])
def clean_sentence(sentence):
    # hashtag and links are module-level compiled patterns.
    if re.search(hashtag, sentence) is not None:
        sentence = re.sub(hashtag, r'\1', sentence)
    sentence = re.sub(links, 'URL', sentence)
    reference = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]
    if len(reference) > 0:
        for key, value in reference.items():
            sentence = sentence.replace(key, value + " ")
    if emoticons['flag']:
        for i in range(len(emoticons['value'])):
            sentence = sentence.replace(emoticons['value'][i],
                                        extract_emotion(emoticons['mean'][i]))
    return sentence
def get_emoji_list(string):
    # NOTE: emoji_pattern is built but never used; demoji does the matching.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    # demoji.findall always returns a dict (possibly empty), never None, so
    # the original `is null` guard (a NameError in Python) is unnecessary.
    emoticons_dict = demoji.findall(string)
    emo_list = []
    for emo in emoticons_dict.keys():
        emo_list.append(emo)
    return emo_list
def clean_tweets(tweet, translate=False, tokenize=False):
    '''Process tweets:
    - tweet preprocessor to get rid of url links, mentions, and hashtags
    - remove punctuation
    - optionally translate to English
    - to lowercase
    - remove common stopwords
    - stem instead of lemmatize (faster, although the drawback is that stemmed
      words sometimes do not look like real words)
    - process emojis, e.g. :) --> smiley face'''
    tweet = str(tweet)  # sometimes not a string type, i.e. float
    orig_tweet = copy.deepcopy(tweet)
    # Remove urls, mentions, and hashtags
    try:
        tweet = p.clean(tweet)
    except Exception as e:
        print(e)
    # 1. Remove anything that is not a letter or whitespace (the original
    #    pattern '[^[a-z|A-Z|\s]*' was malformed).
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    if translate:
        try:
            # Translate to English. The original called translate(tweet), which
            # shadows the boolean flag; a separate helper (hypothetical name
            # translate_to_english) is assumed here.
            tweet = translate_to_english(tweet)
        except json.decoder.JSONDecodeError:
            print(tweet)
    stop = stopwords.words('english')  # common stop words
    stop_words = set(stop)
    # 2. Convert to lower case and tokenize
    tweet = tweet.lower()
    tweet_tokens = word_tokenize(tweet)
    # 3. Remove stopwords, stemming (stemmer hoisted out of the loop)
    stemmer = PorterStemmer()
    ret = []
    for word in tweet_tokens:
        if word not in stop_words:
            ret.append(stemmer.stem(word))
    # 4. Append emoji descriptions from the original (uncleaned) tweet
    ret.extend(list(demoji.findall(orig_tweet).values()))
    if not tokenize:
        ret = ' '.join(ret)
    return ret
def clean_sentence(sentence):
    """
    Replaces all emojis and emoticons with their text equivalent.
    :param sentence: str, raw text
    :return: clean text
    """
    reference = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]
    if len(reference) > 0:
        for key, value in reference.items():
            sentence = sentence.replace(key, value + " ")
    if emoticons['flag']:
        for i in range(len(emoticons['value'])):
            sentence = sentence.replace(emoticons['value'][i],
                                        extract_emotion(emoticons['mean'][i]))
    return sentence
def emoji(cleaned_text, Emoji_emo):
    # Emoji_emo is assumed to hold demoji-style descriptions, one emotion per
    # column; each emoji found in the text bumps the matching emotion scores.
    emoji = demoji.findall(cleaned_text)
    score = pd.DataFrame(np.zeros(shape=(1, 6)).astype(int),
                         columns=['Happy', 'Excitement', 'Pleasant',
                                  'Surprise', 'Fear', 'Angry'])
    for emo in emoji:
        # regex=False: descriptions are literal strings, not patterns.
        if Emoji_emo["Happy"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Happy"] += 1
        if Emoji_emo["Excitement"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Excitement"] += 1
        if Emoji_emo["Pleasant"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Pleasant"] += 1
        if Emoji_emo["Surprise"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Surprise"] += 1
        if Emoji_emo["Fear"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Fear"] += 1
        if Emoji_emo["Angry"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Angry"] += 1
    return score
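# The lexicon layout emoji() assumes: a DataFrame with one column per emotion
# whose cells hold demoji-style descriptions (hypothetical example values):
Emoji_emo = pd.DataFrame({
    'Happy':      ['grinning face', 'smiling face with smiling eyes'],
    'Excitement': ['fire', 'party popper'],
    'Pleasant':   ['relieved face', 'sparkles'],
    'Surprise':   ['astonished face', 'exploding head'],
    'Fear':       ['fearful face', 'face screaming in fear'],
    'Angry':      ['angry face', 'pouting face'],
})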