Beispiel #1
0
def test_emoji_list():
    """Verify the offsets and emoji strings returned by ``emoji.emoji_list``."""
    first_match = emoji.emoji_list('Hi, I am 👌 test')[0]
    assert first_match['match_start'] == 9
    assert emoji.emoji_list('Hi') == []

    # On Python builds with UCS-2 strings the lengths and positions differ,
    # so the exact-position checks below are skipped there.
    if len('Hello 🇫🇷👌') >= 10:
        return

    expected_single = [{'match_start': 15, 'match_end': 16, 'emoji': '😁'}]
    assert emoji.emoji_list('Hi, I am fine. 😁') == expected_single

    expected_pair = [
        {'emoji': '🇫🇷', 'match_start': 6, 'match_end': 8},
        {'emoji': '👌', 'match_start': 8, 'match_end': 9},
    ]
    assert emoji.emoji_list('Hello 🇫🇷👌') == expected_pair
Beispiel #2
0
def format_message(body, is_quote=False):
    """Turn a message body into HTML-safe text with styled emoji.

    - Each emoji reported by ``emoji_list`` is wrapped in
      ``<span class='msg-emoji'>``
    - ``&``, ``<`` and ``>`` are replaced by their HTML entities
    - Every other character is copied through unchanged

    ``is_quote`` is accepted but not used by this function.
    """
    html_entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}

    # Map the start index of every emoji to the emoji string found there.
    emoji_at = {}
    for found in emoji_list(body):
        emoji_at[found["location"]] = found["emoji"]

    pieces = []
    pending = 0  # characters left to drop because they belong to the last emoji
    for pos, char in enumerate(body):
        if pending > 0:
            pending -= 1
            continue
        if pos in emoji_at:
            symbol = emoji_at[pos]
            pieces.append("<span class='msg-emoji'>%s</span>" % symbol)
            # A multi-character emoji was emitted whole; skip its remainder.
            pending = len(symbol) - 1
            continue
        pieces.append(html_entities.get(char, char))
    return "".join(pieces)
Beispiel #3
0
def main():
    """Write texts and labels for tweets that contain exactly one distinct emoji.

    Reads ``tweets_file`` line by line. When the file name contains ".json"
    each line is parsed as JSON and its ``text`` field (with newlines removed)
    is used; otherwise the raw line is used. For every text whose emoji list
    yields exactly one distinct ``code`` that is present in ``mapping``, the
    cleaned text is appended to ``<tweets_file>.text`` and the mapped label to
    ``<tweets_file>.labels``. Progress is printed every 10000 lines and a
    summary is printed at the end.

    ``tweets_file``, ``mapping`` and ``clean_text`` are taken from the
    enclosing module.
    """
    # The input format depends only on the file name, so decide it once.
    is_json = ".json" in tweets_file
    tot = 0
    ok = 0

    # Context managers guarantee all three files are closed even when a line
    # fails to parse or an output write raises.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file) as f_in:
        for line in f_in:
            if is_json:
                j = json.loads(line)
                text = j['text'].replace("\n", "")
            else:
                text = line

            emo_list = emoji.emoji_list(text)
            emo_set = {d['code'] for d in emo_list if 'code' in d}
            # Only keep examples labelled by a single, unambiguous emoji.
            if len(emo_set) == 1:
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    ct = clean_text(text)
                    out_text.write(ct + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    ok += 1
            if tot % 10000 == 0:
                # Single-argument print(...) prints the same on Python 2 and 3.
                print(str(tot))
            tot += 1

    print(str(ok) + " good examples out of " + str(tot))
Beispiel #4
0
def format_emoji(body, is_quote=False):
    """Wrap emoji in <span> so we can style it easily.

    Each emoji reported by ``emoji_list`` is emitted once, inside
    ``<span class='msg-emoji'>``, at the position where it starts. All other
    characters are copied through unchanged (no HTML escaping is done here).

    ``is_quote`` is accepted but not used by this function.

    Returns:
        The body with every emoji wrapped in a span.
    """
    emoji_lookup = {p["location"]: p["emoji"] for p in emoji_list(body)}
    parts = []
    skip = 0
    for i, c in enumerate(body):
        if skip > 0:
            # Skip additional characters from multi-character emoji; they
            # were already emitted inside the span and must not be repeated.
            skip -= 1
        elif i in emoji_lookup:
            found = emoji_lookup[i]
            parts.append("<span class='msg-emoji'>%s</span>" % found)
            # Assumes the emoji string spans len(found) characters of body
            # starting at its location.
            skip = len(found) - 1
        else:
            parts.append(c)
    return "".join(parts)
Beispiel #5
0
def format_message(body, mentions=None):
    """Format message by processing all characters.

    - Wrap emoji in <span> for styling them
    - Escape special HTML chars
    - Replace an object replacement character that has an entry in
      ``mentions`` with a ``<span class='msg-mention'>`` element

    Args:
        body: Message text, or ``None``.
        mentions: Optional mapping from a character index in ``body`` to a
            mention object exposing ``name`` and ``length``. ``None`` (the
            default) is treated as "no mentions".

    Returns:
        The formatted string, or ``None`` when ``body`` is ``None``.
    """
    if body is None:
        return None
    # A None default avoids sharing one dict object between all calls.
    if mentions is None:
        mentions = {}

    emoji_lookup = {p["location"]: p["emoji"] for p in emoji_list(body)}
    html_entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}
    parts = []
    skip = 0
    for i, c in enumerate(body):
        if skip > 0:
            # Skip additional characters from multi-character emoji
            # (or from a mention longer than one character).
            skip -= 1
        elif i in emoji_lookup:
            found = emoji_lookup[i]
            parts.append("<span class='msg-emoji'>%s</span>" % found)
            skip = len(found) - 1
        elif c in html_entities:
            parts.append(html_entities[c])
        elif c == "\ufffc":  # Object replacement character
            mention = mentions.get(i)
            if mention:
                # The name is formatted recursively so it is escaped too.
                parts.append(
                    "<span class='msg-mention'>@%s</span>"
                    % format_message(mention.name)
                )
                # Not clear in what case this is not 1
                skip = mention.length - 1
            else:
                parts.append(c)
        else:
            parts.append(c)
    return "".join(parts)
Beispiel #6
0
def is_all_emoji(body):
    """Check if a message is non-empty and only contains emoji.

    Spaces and variation selectors (U+FE0F) are ignored. The remaining text
    counts as "all emoji" when the emoji found by ``emoji_list`` account for
    every remaining character.

    Returns:
        ``True`` if the stripped body is non-empty and fully covered by
        emoji, ``False`` otherwise.
    """
    body = body.replace(" ", "").replace("\ufe0f", "")
    if not body:
        return False
    # Compare covered characters, not the number of matches: one emoji can
    # span several characters (flags, joined sequences), so counting matches
    # would reject messages made only of such emoji.
    covered = sum(len(p["emoji"]) for p in emoji_list(body))
    return covered == len(body)
Beispiel #7
0
def test_text():
    """Round-trip randomly inserted emoji through the ``emoji`` text functions.

    Emoji are inserted into a multilingual sample text at random positions,
    then ``demojize``, ``emojize``, ``replace_emoji``, ``distinct_emoji_list``
    and ``emoji_list`` are checked against the known insertions. The check is
    run for fully-qualified emoji (default names, "es" names, aliases) and
    for all emoji.
    """
    # don't break up characters on python with UCS-2
    is_ucs2 = len('Hello 🇫🇷👌') > 9

    text = u"""Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat in reprehenderit in cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
Stróż pchnął kość w quiz gędźb vel fax myjń.
Høj bly gom vandt fræk sexquiz på wc.
Съешь же ещё этих мягких французских булок, да выпей чаю.
За миг бях в чужд плюшен скърцащ фотьойл.
هلا سكنت بذي ضغثٍ فقد زعموا — شخصت تطلب ظبياً راح مجتازا
שפן אכל קצת גזר בטעם חסה, ודי
ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।
とりなくこゑす ゆめさませ みよあけわたる ひんかしを そらいろはえて おきつへに ほふねむれゐぬ もやのうち
視野無限廣,窗外有藍天
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""

    def english_name(emj_data):
        return emj_data['en']

    def spanish_name(emj_data):
        return emj_data["es"] if "es" in emj_data else False

    def first_alias(emj_data):
        return emj_data["alias"][0] if "alias" in emj_data else False

    def add_random_emoji(source, candidates, select=english_name):
        """Insert one random emoji into each 10-character slice of ``source``.

        Returns the text with emoji, the same text with the selected
        placeholder names instead, and the inserted emoji in order.
        """
        unicode_chunks = []
        placeholder_chunks = []
        inserted = []
        for start in range(0, len(source), 10):
            stop = start + 10

            # Draw until the selector yields a usable placeholder.
            placeholder = None
            while not placeholder:
                emj, emj_data = random.choice(candidates)
                placeholder = select(emj_data)

            if is_ucs2:
                # Only insert at a space so no character is split.
                cut = source.find(u" ", start, stop)
                if cut == -1:
                    continue
            else:
                cut = random.randint(start, stop)

            before = source[start:cut]
            after = source[cut:stop]
            unicode_chunks.append(before + emj + after)
            placeholder_chunks.append(before + placeholder + after)
            inserted.append(emj)

        return u"".join(unicode_chunks), u"".join(placeholder_chunks), inserted

    def clean(s):
        return s.replace(u'\u200d', '').replace(u'\ufe0f', '')

    def check_round_trip(candidates, select=None, language=None, normalize=None):
        """Run the full set of assertions for one emoji pool and name source."""
        if select is None:
            with_unicode, with_names, inserted = add_random_emoji(text, candidates)
        else:
            with_unicode, with_names, inserted = add_random_emoji(text, candidates, select)

        lang_kwargs = {} if language is None else {"language": language}

        assert emoji.demojize(with_unicode, **lang_kwargs) == with_names
        emojized = emoji.emojize(with_names, **lang_kwargs)
        if normalize is None:
            assert emojized == with_unicode
        else:
            assert normalize(emojized) == normalize(with_unicode)
        if not is_ucs2:
            assert emoji.replace_emoji(with_unicode, u'') == text
        assert set(emoji.distinct_emoji_list(with_unicode)) == set(inserted)
        for idx, found in enumerate(emoji.emoji_list(with_unicode)):
            assert found['emoji'] == inserted[idx]

    all_emoji_list = list(emoji.EMOJI_DATA.items())
    fully_qualified = emoji.STATUS['fully_qualified']
    qualified_emoji_list = []
    for emj, item in emoji.EMOJI_DATA.items():
        if item['status'] == fully_qualified:
            qualified_emoji_list.append((emj, item))

    # qualified emoji
    check_round_trip(qualified_emoji_list)

    # qualified emoji from "es"
    check_round_trip(qualified_emoji_list, select=spanish_name, language="es")

    # qualified emoji from "alias"
    check_round_trip(qualified_emoji_list, select=first_alias, language="alias")

    # all emoji
    check_round_trip(all_emoji_list, normalize=clean)