def test_emoji_list():
    """Verify the match dictionaries returned by ``emoji.emoji_list``."""
    first_hit = emoji.emoji_list('Hi, I am 👌 test')[0]
    assert first_hit['match_start'] == 9

    # A string without any emoji produces an empty result.
    assert emoji.emoji_list('Hi') == []

    # On interpreters that store strings as UCS-2 the sample below is longer
    # than nine units, so the expected offsets would not apply; stop here.
    if not len('Hello 🇫🇷👌') < 10:
        return

    single_expected = [{'match_start': 15, 'match_end': 16, 'emoji': '😁'}]
    assert emoji.emoji_list('Hi, I am fine. 😁') == single_expected

    pair_expected = [
        {'emoji': '🇫🇷', 'match_start': 6, 'match_end': 8},
        {'emoji': '👌', 'match_start': 8, 'match_end': 9},
    ]
    assert emoji.emoji_list('Hello 🇫🇷👌') == pair_expected
def format_message(body, is_quote=False):
    """Format message by processing all characters.

    - Wrap emoji in <span> for styling them
    - Escape special HTML chars (``&``, ``<`` and ``>``)

    Args:
        body: Message text to convert.
        is_quote: Accepted for interface compatibility; not read here.

    Returns:
        The message as an HTML fragment string.
    """
    from html import escape

    # Start index of every emoji in ``body`` -> the emoji string itself.
    emoji_lookup = {p["location"]: p["emoji"] for p in emoji_list(body)}

    parts = []
    skip = 0
    for i, c in enumerate(body):
        if skip > 0:
            # Remaining code points of a multi-character emoji that has
            # already been emitted as a whole.
            skip -= 1
        elif i in emoji_lookup:
            found = emoji_lookup[i]
            parts.append("<span class='msg-emoji'>%s</span>" % found)
            skip = len(found) - 1
        else:
            # quote=False limits escaping to &, < and >, which is all that is
            # needed for text placed in element content.
            parts.append(escape(c, quote=False))
    return "".join(parts)
def main():
    """Extract single-emoji examples from ``tweets_file``.

    Reads ``tweets_file`` line by line. When its name contains ``".json"``
    each line is parsed as JSON and its ``'text'`` field is used with
    newlines removed; otherwise the raw line is used. A line is kept when
    exactly one distinct emoji code is found in it and that code, encoded as
    UTF-8, is a key of ``mapping``. For each kept line the cleaned text goes
    to ``tweets_file + ".text"`` and the mapped label to
    ``tweets_file + ".labels"``.

    Progress is printed every 10000 input lines and a summary at the end.
    """
    # The input format depends only on the file name, so decide it once.
    is_json = ".json" in tweets_file

    tot = 0
    ok = 0
    # Context managers make sure all three files are closed even when a line
    # fails to parse or a write raises.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file) as f_in:
        for line in f_in:
            if is_json:
                text = json.loads(line)['text'].replace("\n", "")
            else:
                text = line

            emo_set = {d['code'] for d in emoji.emoji_list(text) if 'code' in d}
            if len(emo_set) == 1:
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    out_text.write(clean_text(text) + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    ok += 1

            if tot % 10000 == 0:
                # Single-argument call form prints the same on Python 2 and 3.
                print(str(tot))
            tot += 1

    print(str(ok) + " good examples out of " + str(tot))
def format_emoji(body, is_quote=False):
    """Wrap emoji in <span> so we can style it easily.

    Args:
        body: Message text to process.
        is_quote: Accepted for interface compatibility; not read here.

    Returns:
        ``body`` with every emoji enclosed in
        ``<span class='msg-emoji'>...</span>``; all other characters are
        copied through unchanged.
    """
    # Start index of every emoji in ``body`` -> the emoji string itself.
    emoji_lookup = {p["location"]: p["emoji"] for p in emoji_list(body)}

    parts = []
    skip = 0
    for i, c in enumerate(body):
        if skip > 0:
            # This code point belongs to an emoji that was already written
            # in full; copying it again would duplicate part of the emoji
            # right after the closing </span>.
            skip -= 1
        elif i in emoji_lookup:
            found = emoji_lookup[i]
            parts.append("<span class='msg-emoji'>%s</span>" % found)
            skip = len(found) - 1
        else:
            parts.append(c)
    return "".join(parts)
def format_message(body, mentions=None):
    """Format message by processing all characters.

    - Wrap emoji in <span> for styling them
    - Escape special HTML chars (``&``, ``<`` and ``>``)
    - Replace mention placeholders with a styled ``@name`` span

    Args:
        body: Message text to convert, or None.
        mentions: Optional mapping from a character index in ``body`` to a
            mention object exposing ``name`` and ``length``. It is consulted
            only where ``body`` holds the object replacement character
            (U+FFFC). None is treated as "no mentions".

    Returns:
        The message as an HTML fragment string, or None when ``body`` is
        None.
    """
    from html import escape

    if body is None:
        return None
    if mentions is None:
        mentions = {}

    # Start index of every emoji in ``body`` -> the emoji string itself.
    emoji_lookup = {p["location"]: p["emoji"] for p in emoji_list(body)}

    parts = []
    skip = 0
    for i, c in enumerate(body):
        if skip > 0:
            # Skip additional characters from a multi-character emoji or
            # from a mention placeholder longer than one character.
            skip -= 1
            continue

        if i in emoji_lookup:
            found = emoji_lookup[i]
            parts.append("<span class='msg-emoji'>%s</span>" % found)
            skip = len(found) - 1
            continue

        # Object replacement character marks where a mention was inserted.
        mention = mentions.get(i) if c == "\ufffc" else None
        if mention:
            # The name is run through format_message so it is escaped too.
            parts.append(
                "<span class='msg-mention'>@%s</span>"
                % format_message(mention.name)
            )
            skip = mention.length - 1  # Not clear in what case this is not 1
        else:
            # quote=False limits escaping to &, < and >.
            parts.append(escape(c, quote=False))
    return "".join(parts)
def is_all_emoji(body):
    """Check if a message is non-empty and only contains emoji.

    Spaces and the variation selector U+FE0F are ignored.

    Args:
        body: Message text to inspect.

    Returns:
        True when, after removing the ignored characters, the text is
        non-empty and every remaining code point belongs to an emoji match;
        False otherwise.
    """
    stripped = body.replace(" ", "").replace("\ufe0f", "")
    if not stripped:
        return False

    # Count covered code points, not matches: one emoji may span several
    # code points, and comparing the match count with len() would reject
    # messages made of such emoji.
    covered = sum(len(p["emoji"]) for p in emoji_list(stripped))
    return covered == len(stripped)
def test_text():
    """Round-trip a multilingual text with randomly inserted emoji.

    Emoji drawn from ``emoji.EMOJI_DATA`` are inserted into a sample text,
    once as the emoji itself and once as the corresponding placeholder name.
    The two variants are then checked against ``demojize``, ``emojize``,
    ``replace_emoji``, ``distinct_emoji_list`` and ``emoji_list``. The
    scenario is run for fully-qualified emoji with the default, ``"es"`` and
    ``"alias"`` names, and finally for every emoji in ``EMOJI_DATA``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. It assumes only the ``replace_emoji``
    assertion sits under ``if not UCS2`` - confirm against the original.
    """
    UCS2 = len('Hello 🇫🇷👌') > 9  # don't break up characters on python with UCS-2

    text = u"""Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat in reprehenderit in cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Stróż pchnął kość w quiz gędźb vel fax myjń. Høj bly gom vandt fræk sexquiz på wc. Съешь же ещё этих мягких французских булок, да выпей чаю. За миг бях в чужд плюшен скърцащ фотьойл. هلا سكنت بذي ضغثٍ فقد زعموا — شخصت تطلب ظبياً راح مجتازا שפן אכל קצת גזר בטעם חסה, ודי ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे। とりなくこゑす ゆめさませ みよあけわたる ひんかしを そらいろはえて おきつへに ほふねむれゐぬ もやのうち 視野無限廣,窗外有藍天 Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
"""

    def add_random_emoji(text, lst, select=lambda emj_data: emj_data['en']):
        # Walks ``text`` in steps of 10 characters and inserts one random
        # entry of ``lst`` into each chunk. Returns the text containing the
        # emoji, the text containing the placeholders chosen by ``select``,
        # and the inserted emoji in insertion order.
        emoji_list = []
        text_with_unicode = u""
        text_with_placeholder = u""
        for i in range(0, len(text), 10):
            # Redraw until ``select`` yields a truthy placeholder; the
            # selectors used below return False when a name is missing.
            while True:
                emj, emj_data = random.choice(lst)
                placeholder = select(emj_data)
                if placeholder:
                    break

            if UCS2:
                # Insert only at a space; a chunk without one is skipped
                # entirely, so it appears in neither output text.
                j = text.find(u" ", i, i + 10)
                if j == -1:
                    continue
            else:
                j = random.randint(i, i + 10)

            text_with_unicode += text[i:j]
            text_with_unicode += emj
            text_with_unicode += text[j:i + 10]

            text_with_placeholder += text[i:j]
            text_with_placeholder += placeholder
            text_with_placeholder += text[j:i + 10]

            emoji_list.append(emj)

        return text_with_unicode, text_with_placeholder, emoji_list

    def clean(s):
        # Removes U+200D (zero width joiner) and U+FE0F (variation selector).
        return s.replace(u'\u200d', '').replace(u'\ufe0f', '')

    all_emoji_list = list(emoji.EMOJI_DATA.items())
    # Only the entries whose status equals STATUS['fully_qualified'].
    qualified_emoji_list = [(emj, item) for emj, item in emoji.EMOJI_DATA.items() if item['status'] == emoji.STATUS['fully_qualified']]

    # qualified emoji
    text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list)
    assert emoji.demojize(text_with_unicode) == text_with_placeholder
    assert emoji.emojize(text_with_placeholder) == text_with_unicode
    if not UCS2:
        # In UCS-2 mode chunks may have been dropped, so the original text
        # cannot be recovered by stripping the emoji.
        assert emoji.replace_emoji(text_with_unicode, u'') == text
    assert set(emoji.distinct_emoji_list(text_with_unicode)) == set(emoji_list)
    for i, lis in enumerate(emoji.emoji_list(text_with_unicode)):
        assert lis['emoji'] == emoji_list[i]

    # qualified emoji from "es"
    selector = lambda emoji_data: emoji_data["es"] if "es" in emoji_data else False
    text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list, selector)
    assert emoji.demojize(text_with_unicode, language="es") == text_with_placeholder
    assert emoji.emojize(text_with_placeholder, language="es") == text_with_unicode
    if not UCS2:
        assert emoji.replace_emoji(text_with_unicode, u'') == text
    assert set(emoji.distinct_emoji_list(text_with_unicode)) == set(emoji_list)
    for i, lis in enumerate(emoji.emoji_list(text_with_unicode)):
        assert lis['emoji'] == emoji_list[i]

    # qualified emoji from "alias"
    selector = lambda emoji_data: emoji_data["alias"][0] if "alias" in emoji_data else False
    text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list, selector)
    assert emoji.demojize(text_with_unicode, language="alias") == text_with_placeholder
    assert emoji.emojize(text_with_placeholder, language="alias") == text_with_unicode
    if not UCS2:
        assert emoji.replace_emoji(text_with_unicode, u'') == text
    assert set(emoji.distinct_emoji_list(text_with_unicode)) == set(emoji_list)
    for i, lis in enumerate(emoji.emoji_list(text_with_unicode)):
        assert lis['emoji'] == emoji_list[i]

    # all emoji
    text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, all_emoji_list)
    assert emoji.demojize(text_with_unicode) == text_with_placeholder
    # Both sides are passed through clean() so the comparison ignores
    # U+200D / U+FE0F differences.
    assert clean(emoji.emojize(text_with_placeholder)) == clean(text_with_unicode)
    if not UCS2:
        assert emoji.replace_emoji(text_with_unicode, u'') == text
    assert set(emoji.distinct_emoji_list(text_with_unicode)) == set(emoji_list)
    for i, lis in enumerate(emoji.emoji_list(text_with_unicode)):
        assert lis['emoji'] == emoji_list[i]