Example #1
def test_deepmoji_return_attention():
    # test the output of the normal model
    model = deepmoji_emojis(maxlen=30, weight_path=PRETRAINED_PATH)
    # check correct number of outputs
    assert 1 == len(model.outputs)
    # check model outputs come from correct layers
    assert [['softmax', 0, 0]] == model.get_config()['output_layers']
    # ensure that output shapes are correct (assume a batch of 5 examples with 30 timesteps each)
    input_shape = (5, 30, 2304)
    assert (5, 2304) == model.layers[6].compute_output_shape(input_shape)

    # repeat above described tests when returning attention weights
    model = deepmoji_emojis(maxlen=30, weight_path=PRETRAINED_PATH, return_attention=True)
    assert 2 == len(model.outputs)
    assert [['softmax', 0, 0], ['attlayer', 0, 1]] == model.get_config()['output_layers']
    assert [(5, 2304), (5, 30)] == model.layers[6].compute_output_shape(input_shape)
Example #3
def main():
    df = pd.read_csv('../data/interim/sentences.csv')

    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

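    # Convert each body to unicode and replace empty strings with the
    # placeholder 'blank' so the tokenizer never receives an empty sentence.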
    sentences = []
    for sent in df.body.tolist():
        sent = unicode(str(sent), "utf-8")
        if sent.strip() == "":
            sent = 'blank'
            sent = unicode(str(sent), "utf-8")
        sentences.append(sent)

    tokenized, _, _ = st.tokenize_sentences(sentences)

    # generate full deepmoji features for sentences
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Encoding texts with deepmoji features...')
    encoding = model.predict(tokenized)

    deepmoji_encodings = pd.DataFrame(encoding)
    deepmoji_encodings.index = df.post_id

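    # Aggregate sentence-level DeepMoji encodings to post level (mean/max/min of each feature)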
    deepmoji_post_scores = deepmoji_encodings.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    deepmoji_post_scores = flatten_cols(deepmoji_post_scores)
    deepmoji_post_scores = deepmoji_post_scores.add_prefix('deepmoji_')

    # generate 64 emoji encodings
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running emoji predictions...')
    prob = model.predict(tokenized)
    emoji_scores = pd.DataFrame(prob)
    emoji_scores = emoji_scores.add_prefix('emoji_')
    emoji_scores.index = df.post_id

    emoji_post_scores = emoji_scores.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    emoji_post_scores = flatten_cols(emoji_post_scores)

    print('deepmoji features shape: {}'.format(deepmoji_post_scores.shape))
    print('emoji features shape: {}'.format(emoji_post_scores.shape))
    total_feats = deepmoji_post_scores.merge(emoji_post_scores,
                                             left_index=True,
                                             right_index=True)
    print('total features shape: {}'.format(total_feats.shape))
    total_feats.to_csv('../data/interim/all_sent_level_deepmoji.csv')
Example #4
def scoreTexts(TEST_SENTENCES):
    global vocabulary, model

    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

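    # Load the emoji model lazily on the first call and reuse it afterwards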
    if model is None:
        model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
        model.summary()

    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = {}
        t_score["text"] = t
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        #t_score["prob"]=sum(t_prob[ind_top])

        emoji_score = {}
        for ind in ind_top:
            emoji_score[ind] = t_prob[ind]
        t_score["score"] = emoji_score
        scores.append(t_score)
    return scores
Example #5
    def __init__(self):
        self.maxlen = 30

        self.load_mappings()

        print('Loading model from {}.'.format(PRETRAINED_PATH))
        self.model = deepmoji_emojis(self.maxlen, PRETRAINED_PATH)
        self.model.summary()
Example #6
def model_deep(language):
    maxlen = 30
    batch_size = 32
    #list_new = []
    #list_new.append(language)
    answer = [unicode(item) for item in language]
    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(answer)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(answer):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        t_score.extend([t_prob[ind] for ind in ind_top])
        scores.append(t_score)
        print(t_score)

    with open(OUTPUT_PATH, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
        writer.writerow([
            'Text', 'Top5%', 'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4',
            'Emoji_5', 'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'
        ])
        for i, row in enumerate(scores):
            try:
                writer.writerow(row)
            except Exception:
                print("Exception at row {}!".format(i))

    print(scores)
    return ''.join(str(e) for e in scores)
Example #7
def emoji_predict(sen_list,
                  maxlen=30,
                  step=32,
                  model_path='../model/deepmoji_weights.hdf5',
                  vocab_path='../model/vocabulary.json'):
    model = deepmoji_emojis(maxlen, model_path)
    model.summary()

    with open(vocab_path, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary,
                           maxlen,
                           ignore_sentences_with_only_custom=True)
    records = []

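    # Predict in fixed-size batches; if the tokenizer drops any sentence in a
    # batch, skip the whole batch so probabilities stay aligned with the texts.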
    for i in range(0, len(sen_list), step):
        if i + step >= len(sen_list):
            tokenized, _, _ = st.tokenize_sentences(sen_list[i:len(sen_list)])
            content = sen_list[i:len(sen_list)]
            if len(tokenized) != len(content):
                print('Skip ' + str(i))
                continue
        else:
            tokenized, _, _ = st.tokenize_sentences(sen_list[i:i + step])
            content = sen_list[i:i + step]
            if len(tokenized) != len(content):
                print('Skip ' + str(i))
                continue
        prob = model.predict(tokenized)
        for j in range(len(content)):
            r = {}
            r['text'] = [content[j]]
            t_prob = prob[j]
            ind_top = top_elements(t_prob, 5)
            r['confidence'] = (str(sum(t_prob[ind_top])))
            r['top5emoji'] = [unicode(emoji_list[ind]) for ind in ind_top]
            r['top5prob'] = [str(t_prob[ind]) for ind in ind_top]
            r['prob'] = [str(num) for num in t_prob]
            records.append(r)
        if i % 1024 == 0:
            print('Processing: ' + str(i) + '/' + str(len(sen_list)))

    return records
Example #8
def predict_emoji(training_data, maxlen):
    '''
    Predicts the emojis commonly associated with the given sentences.
    :param training_data: data with a 'sentence' column containing the texts to predict
    :param maxlen: max length of the sentences given
    :return: emoji probabilities for each sentence
    '''
    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    sentences = training_data['sentence']

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized, batch_size=500)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    # print('Writing results to {}'.format(OUTPUT_PATH))
    # scores = []
    # for i, t in enumerate(sentences):
    #     t_tokens = tokenized[i]
    #     t_score = [t]
    #     t_prob = prob[i]
    #     ind_top = top_elements(t_prob, 5)
    #     t_score.append(sum(t_prob[ind_top]))
    #     t_score.extend(ind_top)
    #     t_score.extend([t_prob[ind] for ind in ind_top])
    #     scores.append(t_score)
    #     print(t_score)

    return prob
Example #9
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        u'I love mom\'s cooking',
        u'I love how you never reply back..',
        u'I love cruising with my homies',
        u'I love messing with yo mind!!',
        u'I love you and now you\'re just gone..',
        u'This is shit',
        u'This is the shit'
    ]

    expected = [
        np.array([36,  4,  8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31,  6, 30, 15, 13]),
        np.array([54, 44,  9, 50, 49]),
        np.array([46,  5, 27, 35, 34]),
        np.array([55, 32, 27,  1, 37]),
        np.array([48, 11,  6, 31,  9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokenized, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = deepmoji_emojis(maxlen=30, weight_path=PRETRAINED_PATH)
    prob = model.predict(tokenized)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(prob):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
Example #10
def emoticonit(sen):
    TEST_SENTENCES = [unicode(sen)]

    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
        st = SentenceTokenizer(vocabulary, maxlen)
        tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.

    scores = []
    selected = []
    num = 1
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, num)
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        ind = ind_top.tolist()
        for j in range(num):
            print(emoticons[ind[j]])
            selected.append(emoticons[ind[j]])
        t_score.extend([t_prob[k] for k in ind_top])
        scores.append(t_score)
        print(t_score)
    return (selected)
Example #12
def predict_emoji(training_data, maxlen):
    '''
    Predicts the emojis commonly associated with the given sentences.
    :param training_data: list of sentences to predict
    :param maxlen: max length of the sentences given
    :return: emoji probabilities for each sentence
    '''

    sentences = training_data

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized, batch_size=100)

    return prob
Example #13
def get_top_n_emojis(st, deepmoji_model, sentence,
                     most_n: int = 5,
                     min_dist: float = None) -> List[str]:
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = deepmoji_model.predict(tokenized)
    for i, t_prob in enumerate(prob):
        if min_dist is not None:
            ids = list(i for i in top_elements(t_prob, most_n)
                       if i in elements_past_min(t_prob, min_dist))
        else:
            ids = list(top_elements(t_prob, most_n))
        return list([EMOJI_MAP[emoji_index] for emoji_index in ids])


sentence_tokenizer = SentenceTokenizer(get_vocabulary(), 30)
deepmoji_model = deepmoji_emojis(
    maxlen=30,
    weight_path=PRETRAINED_PATH,
)

deepmoji_model.summary()


def sentiment_query(word: str, most_n: int = 5, min_dist: float = None):
    return get_top_n_emojis(sentence_tokenizer,
                            deepmoji_model,
                            word,
                            most_n=most_n,
                            min_dist=min_dist)


sentiment_query("I lost my dog oh no")
Example #14
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

print('Running predictions.')
prob = model.predict(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the DeepMoji repo.
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
    t_tokens = tokenized[i]
    t_score = [t]
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 5)
Example #15
# model = torchmoji_feature_encoding(PRETRAINED_PATH, return_attention=True)
# print(model)
#
# print('Encoding texts..')
# encoding, att_weights = model(tokenized)
# att_weights = att_weights.cpu().data.numpy()


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


print('Loading emoji pred model from {}.'.format(PRETRAINED_PATH),
      file=sys.stdout)
model = deepmoji_emojis(maxlen, PRETRAINED_PATH, return_attention=True)
model.summary()
print('Running predictions.', file=sys.stdout)
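# With return_attention=True the model has two outputs: the 64-way emoji
# probabilities and the per-token attention weights for each sentence.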
prob, att_weights = model.predict(tokenized)

emojis = []
for prob in [prob]:
    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the torchMoji repo.
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        tmp = map(lambda x: EMOJIS[x], ind_top)
Example #16
    def __init__(self, maxlen):
        self.model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
        self.maxlen = maxlen
Example #17
def start(r, auth, keyword, max_items):
    api = tweepy.API(auth)
    para = ""

    happy_counter = 0
    sad_counter = 0
    fear_counter = 0
    angry_counter = 0
    love_counter = 0

    happy_buffer = []
    sad_buffer = []
    fear_buffer = []
    angry_buffer = []
    love_buffer = []

    happy_phrases = []
    sad_phrases = []
    fear_phrases = []
    angry_phrases = []
    love_phrases = []

    happy_para = ''
    sad_para = ''
    fear_para = ''
    angry_para = ''
    love_para = ''

    happy_location = []
    sad_location = []
    fear_location = []
    angry_location = []
    love_location = []

    def check_token(token):
        for i in class_tokens:
            if token in class_tokens[i]:
                return i
        return -1

    TEST_SENTENCES = []

    LOCATIONS = []

    for tweet in tweepy.Cursor(api.search,
                               q=keyword,
                               count=100,
                               lang='en',
                               include_entities=False,
                               tweet_mode='extended').items(max_items):

        location = tweet.user.location
        if not location:
            location = ""
        else:
            if "," in location:
                location = location[0:location.index(",")]

        location = location.strip()
        LOCATIONS.append(location)
        # print('Location :' , location)

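        # For retweets, try to use the full_text of the original (retweeted) status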
        temp = tweet._json.get('full_text')

        if temp.startswith("RT"):
            try:
                temp = tweet._json.get('retweeted_status').get('full_text')
            except Exception:
                temp = tweet._json.get('full_text')
        else:
            temp = tweet._json.get('full_text')

        temp = temp.replace("RT ", "").replace("!", "").replace(
            "..",
            "").replace("$", "").replace("%", "").replace("&", "").replace(
                "~",
                "").replace("-", "").replace("+", "").replace("#", "").replace(
                    "\\n", "").replace("\\", "").replace("|", "")

        temp = " ".join(filter(lambda x: x[0] != '@', temp.split()))
        temp = re.sub(r'https\S+', "", temp)
        temp = temp.strip()
        para = para + temp
        TEST_SENTENCES.append(temp)

    print('Locations :', LOCATIONS)
    r.extract_keywords_from_text(para)
    # r.get_ranked_phrases_with_scores()

    ranked_phrases = r.get_ranked_phrases()

    for i in range(0, len(ranked_phrases)):
        ranked_phrases[i] = (ranked_phrases[i].replace(",", "").replace("'", "")
                             .replace("(", "").replace(")", "").replace(".", "")
                             .replace("`", "").replace("!", ""))

        ranked_phrases[i] = re.sub(' +', ' ', ranked_phrases[i]).strip()

    top_keywords = ranked_phrases[:]

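    # Keep only ranked phrases of three words or fewer as the top keywords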
    for i in range(0, len(ranked_phrases)):

        t1 = ranked_phrases[i].split()
        if len(t1) > 3:
            top_keywords.remove(ranked_phrases[i])

    # print(TEST_SENTENCES)

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    maxlen = 30
    batch_size = 32

    # print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    # print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    #model.summary()

    # print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    # print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.append(ind_top)
        t_score.append([t_prob[ind] for ind in ind_top])
        t_score.append('' + LOCATIONS[i])
        scores.append(t_score)
    # print(t_score)

    # print('Scores skjdvbkjsdbvjk : ' , scores[0])

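    # Bucket each tweet by its single most likely emoji: map the top emoji id
    # to an emotion class via class_tokens, then collect the text and location.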
    for i, row in enumerate(scores):
        try:
            # print(row[0])
            # print('row 2')
            # print(row[2][0])

            # if (row[2] in class_tokens]
            temp = check_token(row[2][0])
            # print(temp)

            if temp == 'sad':
                sad_counter = 1 + sad_counter
                sad_buffer.append(row[0])
                sad_para = sad_para + row[0]
                sad_location.append(row[4])

            elif temp == 'happy':
                happy_counter = 1 + happy_counter
                # print("happy counter");
                # print(happy_counter);
                happy_buffer.append(row[0])
                happy_para = happy_para + row[0]
                happy_location.append(row[4])

            elif temp == 'fear':
                fear_counter = 1 + fear_counter
                fear_buffer.append(row[0])
                fear_para = fear_para + row[0]
                fear_location.append(row[4])

            elif temp == 'angry':
                angry_counter = 1 + angry_counter
                angry_buffer.append(row[0])
                angry_para = angry_para + row[0]
                angry_location.append(row[4])

            elif temp == 'love':
                love_counter = 1 + love_counter
                love_buffer.append(row[0])
                love_para = love_para + row[0]
                love_location.append(row[4])

        except Exception:
            pass
        # print("Exception at row {}!".format(i))

    # print("Angry buffer : " , angry_buffer)
    # print("Sad buffer : " , sad_buffer)

    r.extract_keywords_from_text(happy_para)
    happy_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(sad_para)
    sad_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(fear_para)
    fear_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(angry_para)
    angry_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(love_para)
    love_phrases = r.get_ranked_phrases()[0:3]

    # print("Phrases " , happy_phrases)
    # print("Angry Locations : " , angry_location)

    return (happy_buffer, sad_buffer, fear_buffer, love_buffer, angry_buffer,
            happy_phrases, sad_phrases, fear_phrases, love_phrases,
            angry_phrases, happy_location, sad_location, fear_location,
            love_location, angry_location, top_keywords[:10])
Example #18
import sys
import os
from os.path import abspath, dirname
sys.path.insert(0, dirname(dirname(abspath(__file__))))

import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_emojis
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

maxlen = 30
batch_size = 32
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

def model_predict(TEST_SENTENCES):
    print(TEST_SENTENCES)
    # print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
Example #19
def analyse_text_chunk(text_chunk):
    OUTPUT_PATH = 'test_sentences.csv'
    json_file = 'test_sentences.json'

    TEST_SENTENCES = sent_tokenize(text_chunk)

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized)

    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        t_score.extend([t_prob[ind] for ind in ind_top])
        scores.append(t_score)
        #print(t_score)

    with open(OUTPUT_PATH, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
        writer.writerow([
            'Text', 'Top5%', 'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4',
            'Emoji_5', 'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'
        ])
        for i, row in enumerate(scores):
            try:
                writer.writerow(row)
            except Exception:
                print("Exception at row {}!".format(i))

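    # Read the CSV back and convert each row into a dict keyed by the header fields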
    csv_rows = []
    with open(OUTPUT_PATH, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        title = reader.fieldnames
        for row in reader:
            csv_rows.extend(
                [{title[i]: row[title[i]]
                  for i in range(len(title))}])

    # Convert csv data into json and write it
    # format = 'pretty'
    # with open(json_file, "w") as f:
    #     if format == "pretty":
    #         f.write(json.dumps(csv_rows, sort_keys=False, indent=4, separators=(',', ': '),
    #                            ensure_ascii=False))
    #     else:
    #         f.write(json.dumps(data))

    return json.dumps(csv_rows,
                      sort_keys=False,
                      indent=4,
                      separators=(',', ': '),
                      ensure_ascii=False)