def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
Example #2
def load_vocab_prepare_data(sentences, vocab_path=VOCAB_PATH, maxlen=100):
    '''
    Loads the DeepMoji vocabulary and prepares the model input from the text encoding.
    INPUT:
        vocab_path : where to find the vocabulary; in principle any vocabulary works, but we prefer to keep the original vocab
        maxlen : length of the tokenized output for each sentence
    OUT:
        returns the dataset ready for training the model (stemming + stop-word removal)
    '''
    stop_words = set(stopwords.words('english'))
    stmr = PorterStemmer()
    sentence_util = []
    for sentence in sentences:
        processed_sentence = []
        for word in sentence:
            if word not in stop_words and word not in string.punctuation:
                processed_sentence.append(stmr.stem(word))
        sentence_util.append(' '.join(processed_sentence))

    print('Tokenizing using dictionary from {}'.format(vocab_path))
    with open(vocab_path, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(sentence_util)
    return tokenized
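
# A minimal usage sketch (not from the source): it assumes NLTK's stopword and
# tokenizer data are installed and that `sentences` is a list of word lists,
# since the helper above iterates over the words of each sentence.
from nltk.tokenize import word_tokenize

raw_texts = [u'I love this movie', u'This was a waste of time']
word_lists = [word_tokenize(t.lower()) for t in raw_texts]
tokenized_input = load_vocab_prepare_data(word_lists, maxlen=100)
print(tokenized_input.shape)  # (number of sentences, maxlen)
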
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences,
                                                      dicts,
                                                      split_parameter,
                                                      extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
def main():
    df = pd.read_csv('../data/interim/sentences.csv')

    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    sentences = []
    for sent in df.body.tolist():
        sent = unicode(str(sent), "utf-8")
        if sent.strip() == "":
            sent = 'blank'
            sent = unicode(str(sent), "utf-8")
        sentences.append(sent)

    tokenized, _, _ = st.tokenize_sentences(sentences)

    # generate full deepmoji features for sentences
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Encoding texts with deepmoji features...')
    encoding = model.predict(tokenized)

    deepmoji_encodings = pd.DataFrame(encoding)
    deepmoji_encodings.index = df.post_id

    deepmoji_post_scores = deepmoji_encodings.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    deepmoji_post_scores = flatten_cols(deepmoji_post_scores)
    deepmoji_post_scores = deepmoji_post_scores.add_prefix('deepmoji_')

    # generate 64 emoji encodings
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running emoji predictions...')
    prob = model.predict(tokenized)
    emoji_scores = pd.DataFrame(prob)
    emoji_scores = emoji_scores.add_prefix('emoji_')
    emoji_scores.index = df.post_id

    emoji_post_scores = emoji_scores.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    emoji_post_scores = flatten_cols(emoji_post_scores)

    print('deepmoji features shape: {}'.format(deepmoji_post_scores.shape))
    print('emoji features shape: {}'.format(emoji_post_scores.shape))
    total_feats = deepmoji_post_scores.merge(emoji_post_scores,
                                             left_index=True,
                                             right_index=True)
    print('total features shape: {}'.format(total_feats.shape))
    total_feats.to_csv('../data/interim/all_sent_level_deepmoji.csv')
Example #5
    def generate_emoji(self, text):
        translator = Translator(from_lang="chinese", to_lang="english")
        translation = translator.translate(text)

        TEST_SENTENCES = [translation]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        self.maxlen = 30
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        st = SentenceTokenizer(vocabulary, self.maxlen)
        tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
        prob = self.model.predict(tokenized)
        t_tokens = tokenized[0]
        t_score = []
        t_prob = prob[0]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.append(ind_top[0])
        print(t_score[1])

        if (t_score[0] > 0.5):
            return (mylabel[t_score[1]])
        else:
            return ('low')
Example #6
def scoreTexts(TEST_SENTENCES):
    global vocabulary, model

    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    if model is None:
        model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
        model.summary()

    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = {}
        t_score["text"] = t
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        #t_score["prob"]=sum(t_prob[ind_top])

        emoji_score = {}
        for ind in ind_top:
            emoji_score[ind] = t_prob[ind]
        t_score["score"] = emoji_score
        scores.append(t_score)
    return scores
    def predict(self, sentence):
        sentence_to_analyze = [sentence]

        print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        st = SentenceTokenizer(vocabulary, self.maxlen)
        tokenized, _, _ = st.tokenize_sentences(sentence_to_analyze)

        print('Running predictions.')
        prob = self.model.predict(tokenized)

        # Find top emojis for each sentence. Emoji ids (0-63)
        # correspond to the mapping in emoji_overview.png
        # at the root of the DeepMoji repo.
        scores = []
        for i, t in enumerate(sentence_to_analyze):
            t_tokens = tokenized[i]
            t_score = [t]
            t_prob = prob[i]
            ind_top = self.top_elements(t_prob, 5)
            ind_top_unicode = []
            for index in ind_top:
                emoji_unicode = self.mapping[index]
                ind_top_unicode.append(emoji_unicode)
            print(ind_top)
            t_score.append(sum(t_prob[ind_top]))
            t_score.extend(ind_top)
            t_score.extend([t_prob[ind] for ind in ind_top])
            scores.append(t_score)
            print(t_score)
            return ind_top_unicode
Example #8
def test_encode_texts():
    """ Text encoding is stable.
    """

    TEST_SENTENCES = [
        u'I love mom\'s cooking', u'I love how you never reply back..',
        u'I love cruising with my homies', u'I love messing with yo mind!!',
        u'I love you and now you\'re just gone..', u'This is shit',
        u'This is the shit'
    ]

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)

    encoding = model.predict(tokenized)
    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
Example #10
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """

    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [str(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]
    print('This is the labels', type(labels), labels)

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    return {
        'texts': texts,
        'labels': labels,
        'added': added,
        'batch_size': batch_size,
        'maxlen': maxlen
    }
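
# Minimal usage sketch, assuming VOCAB_PATH points to vocabulary.json and that
# DATASET_PATH (a placeholder name, not from the source) points to a pickled
# benchmark dataset with 'texts', 'info', 'train_ind', 'val_ind' and 'test_ind' fields.
with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)
data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
(X_train, X_val, X_test) = data['texts']
(y_train, y_val, y_test) = data['labels']
print(data['batch_size'], data['maxlen'], data['added'])
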
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}

    sentence = u'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}

    sentence = u'aasdf basdf ccc'
    expected = u'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
Example #13
    def initialize(self):
        deepmoji_weights_path = os.path.join(self.model_path,
                                             'deepmoji_weights.hdf5')
        vocabulary_path = os.path.join(self.model_path, 'vocabulary.json')
        with open(vocabulary_path, 'r') as f:
            vocab = json.load(f)
        self._st_ = SentenceTokenizer(vocab, self.max_len)
        self._model_ = deepmoji_feature_encoding(self.max_len,
                                                 deepmoji_weights_path,
                                                 self.return_attention)
Example #14
def process_text(df, vocab):
    """ Tokenizes the text for predictions """
    try:
        texts = [unicode(x) for x in df['text']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in df['text']]
    
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(texts)

    return tokenized
Example #17
def model_deep(language):
    maxlen = 30
    batch_size = 32
    #list_new = []
    #list_new.append(language)
    answer = [unicode(item) for item in language]
    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(answer)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(answer):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        t_score.extend([t_prob[ind] for ind in ind_top])
        scores.append(t_score)
        print(t_score)

    with open(OUTPUT_PATH, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
        writer.writerow([
            'Text', 'Top5%', 'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4',
            'Emoji_5', 'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'
        ])
        for i, row in enumerate(scores):
            try:
                writer.writerow(row)
            except Exception:
                print("Exception at row {}!".format(i))

    print(scores)
    return ''.join(str(e) for e in scores)
Example #18
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    with open(filepath, 'w') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))
Example #20
def predict_emoji(training_data, maxlen):
    '''
    Predicts the emojis commonly associated with the given sentences.
    :param training_data: data containing the sentences to predict (in its 'sentence' column)
    :param maxlen: max length of the sentences given
    :return: emoji probabilities for each sentence
    '''
    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    sentences = training_data['sentence']

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized, batch_size=500)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    # print('Writing results to {}'.format(OUTPUT_PATH))
    # scores = []
    # for i, t in enumerate(sentences):
    #     t_tokens = tokenized[i]
    #     t_score = [t]
    #     t_prob = prob[i]
    #     ind_top = top_elements(t_prob, 5)
    #     t_score.append(sum(t_prob[ind_top]))
    #     t_score.extend(ind_top)
    #     t_score.extend([t_prob[ind] for ind in ind_top])
    #     scores.append(t_score)
    #     print(t_score)

    return prob
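
# Hypothetical follow-up (not from the source), assuming training_data is a
# DataFrame-like object with a 'sentence' column: rank the 64 emoji classes
# for the first few sentences from the returned probability matrix.
prob = predict_emoji(training_data, maxlen=30)
for row in prob[:3]:
    top5 = np.argpartition(row, -5)[-5:]
    print(top5[np.argsort(row[top5])][::-1])  # emoji ids, highest probability first
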
Example #21
def emoji_predict(sen_list,
                  maxlen=30,
                  step=32,
                  model_path='../model/deepmoji_weights.hdf5',
                  vocab_path='../model/vocabulary.json'):
    model = deepmoji_emojis(maxlen, model_path)
    model.summary()

    with open(vocab_path, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary,
                           maxlen,
                           ignore_sentences_with_only_custom=True)
    records = []

    for i in range(0, len(sen_list), step):
        if i + step >= len(sen_list):
            tokenized, _, _ = st.tokenize_sentences(sen_list[i:len(sen_list)])
            content = sen_list[i:len(sen_list)]
            if len(tokenized) != len(content):
                print('Skip ' + str(i))
                continue
        else:
            tokenized, _, _ = st.tokenize_sentences(sen_list[i:i + step])
            content = sen_list[i:i + step]
            if len(tokenized) != len(content):
                print('Skip ' + str(i))
                continue
        prob = model.predict(tokenized)
        for j in range(len(content)):
            r = {}
            r['text'] = [content[j]]
            t_prob = prob[j]
            ind_top = top_elements(t_prob, 5)
            r['confidence'] = (str(sum(t_prob[ind_top])))
            r['top5emoji'] = [unicode(emoji_list[ind]) for ind in ind_top]
            r['top5prob'] = [str(t_prob[ind]) for ind in ind_top]
            r['prob'] = [str(num) for num in t_prob]
            records.append(r)
        if i % 1024 == 0:
            print('Processing: ' + str(i) + '/' + str(len(sen_list)))

    return records
Example #22
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        u'I love mom\'s cooking',
        u'I love how you never reply back..',
        u'I love cruising with my homies',
        u'I love messing with yo mind!!',
        u'I love you and now you\'re just gone..',
        u'This is shit',
        u'This is the shit'
    ]

    expected = [
        np.array([36,  4,  8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31,  6, 30, 15, 13]),
        np.array([54, 44,  9, 50, 49]),
        np.array([46,  5, 27, 35, 34]),
        np.array([55, 32, 27,  1, 37]),
        np.array([48, 11,  6, 31,  9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokenized, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = deepmoji_emojis(maxlen=30, weight_path=PRETRAINED_PATH)
    prob = model.predict(tokenized)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(prob):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
def emoticonit(sen):
    TEST_SENTENCES = [unicode(sen)]

    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.

    scores = []
    selected = []
    num = 1
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, num)
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        ind = ind_top.tolist()  #list
        for i in range(num):
            print(emoticons[ind[i]])
            selected.append(emoticons[ind[i]])
        t_score.extend([t_prob[ind] for ind in ind_top])
        scores.append(t_score)
        print(t_score)
    return (selected)
Example #25
def get_texts_sentiment(texts, model_ensemble):
    """
    Get sentiment scores for list of texts
    :param texts:
    :param model_ensemble:
    :return: average_sentiment_prediction
    """
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    twitter_maxlen = 30
    youtube_maxlen = 30

    twitter_st = SentenceTokenizer(vocabulary, twitter_maxlen)
    youtube_st = SentenceTokenizer(vocabulary, youtube_maxlen)

    twitter_tokenized, _, _ = twitter_st.tokenize_sentences(texts)
    youtube_tokenized, _, _ = youtube_st.tokenize_sentences(texts)

    twitter_predictions = model_ensemble[0].predict(twitter_tokenized)
    youtube_predictions = model_ensemble[1].predict(youtube_tokenized)

    average_predictions = (twitter_predictions + youtube_predictions) / 2
    average_sentiment_prediction = [
        modify_range(prediction)[0] for prediction in average_predictions
    ]

    return average_sentiment_prediction
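
# Sketch only (not from the source): model_ensemble is assumed to be a pair of
# fine-tuned DeepMoji Keras sentiment models (e.g. Twitter- and YouTube-trained),
# and modify_range is assumed to map raw predictions to the desired scale.
texts = [u'I love this', u'This is terrible']
for text, score in zip(texts, get_texts_sentiment(texts, model_ensemble)):
    print(text, score)
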
Example #27
def use_deepmoji(maxlen=MAXLEN,
                 vocab_path=DEEPMOJI_VOCAB_FILE,
                 weights_path=DEEPMOJI_WEIGHT_FILE):
    print('Tokenizing using dictionary from {}'.format(vocab_path))
    with open(vocab_path, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(weights_path))
    model = deepmoji_feature_encoding(maxlen, weights_path)
    model.summary()

    return st, model
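
# Usage sketch: the returned tokenizer/model pair can then encode new texts
# into DeepMoji feature vectors (2304 dimensions for the standard pretrained
# feature-encoding model, assumed here).
st, model = use_deepmoji()
tokenized, _, _ = st.tokenize_sentences([u'I love this!'])
features = model.predict(tokenized)
print(features.shape)  # (1, 2304) under the assumptions above
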
Example #28
def predict_emoji(training_data, maxlen):
    '''
    Predicts the emojis commonly associated with the given sentences.
    :param training_data: list of sentences to predict
    :param maxlen: max length of the sentences given
    :return: emoji probabilities for each sentence
    '''

    sentences = training_data

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running predictions.')
    prob = model.predict(tokenized, batch_size=100)

    return prob
def load_non_benchmark(data, vocab, extend_with=0):
    # Decode data
    try:
        texts = [x for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
Example #31
class DeepMojiTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, model_path, return_attention=False, max_len=10):
        self.model_path = model_path
        self.return_attention = return_attention
        self.max_len = max_len
        self.initialize()

    def initialize(self):
        deepmoji_weights_path = os.path.join(self.model_path,
                                             'deepmoji_weights.hdf5')
        vocabulary_path = os.path.join(self.model_path, 'vocabulary.json')
        with open(vocabulary_path, 'r') as f:
            vocab = json.load(f)
        self._st_ = SentenceTokenizer(vocab, self.max_len)
        self._model_ = deepmoji_feature_encoding(self.max_len,
                                                 deepmoji_weights_path,
                                                 self.return_attention)

    def fit(self, X, *_):
        return self

    def transform(self, X, *_):
        tokens, _, _ = self._st_.tokenize_sentences(X)
        vecs = self._model_.predict(tokens)
        if self.return_attention:
            return vecs[1]
        return vecs

    def fit_transform(self, X, y=None, **fit_params):
        return self.transform(X)

    def __setstate__(self, state):
        self.model_path = state['model_path']
        self.max_len = state['max_len']
        self.return_attention = state['return_attention']
        self.initialize()

    def __getstate__(self):
        return {
            'model_path': self.model_path,
            'max_len': self.max_len,
            'return_attention': self.return_attention,
        }
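
# Hedged example (not from the source): plugging the transformer into a
# scikit-learn pipeline, assuming model_path ('model/') contains
# deepmoji_weights.hdf5 and vocabulary.json.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

clf = make_pipeline(DeepMojiTransformer('model/', max_len=30),
                    LogisticRegression())
clf.fit([u'great stuff', u'awful day', u'so happy', u'really bad'], [1, 0, 1, 0])
print(clf.predict([u'what a wonderful surprise']))
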
import json

from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

test='I like!!!wow it'
test1='i like it!'
TEST_SENTENCES = [unicode(test, "utf-8"),unicode(test1, "utf-8")]

maxlen = 30
batch_size = 32



'''
        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))


'''



with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, 10)
tokenized = st.tokenize_sentences(TEST_SENTENCES)
print(tokenized)
Example #33
"""
Take a given list of sentences and turn it into a numpy array, where each
number corresponds to a word. Padding is used (number 0) to ensure fixed length
of sentences.
"""

from __future__ import print_function

import json

from deepmoji.sentence_tokenizer import SentenceTokenizer

with open('../model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 30)
test_sentences = [
    u'\u2014 -- \u203c !!\U0001F602',
    u'Hello world!',
    u'This is a sample tweet #example',
]

tokens, infos, stats = st.tokenize_sentences(test_sentences)

print(tokens)
print(infos)
print(stats)
Example #34
    u'I am sentence 7',
    u'I am sentence 8',
    u'I am sentence 9 newword',
]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)
st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
Example #35
                  u'This is shit',
                  u'This is the shit']


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

print('Running predictions.')
prob = model.predict(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the DeepMoji repo.
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
Example #36
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_emojis
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

maxlen = 30
batch_size = 32
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

def model_predict(TEST_SENTENCES):
    print(TEST_SENTENCES)
    # print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    print(tokenized)
    # print('Loading model from {}.'.format(PRETRAINED_PATH))
    prob = model.predict(tokenized)
    # prob = model.predict(TEST_SENTENCES)
    return prob
def start(r, auth, keyword, max_items):
    api = tweepy.API(auth)
    para = ""

    happy_counter = 0
    sad_counter = 0
    fear_counter = 0
    angry_counter = 0
    love_counter = 0

    happy_buffer = []
    sad_buffer = []
    fear_buffer = []
    angry_buffer = []
    love_buffer = []

    happy_phrases = []
    sad_phrases = []
    fear_phrases = []
    angry_phrases = []
    love_phrases = []

    happy_para = ''
    sad_para = ''
    fear_para = ''
    angry_para = ''
    love_para = ''

    happy_location = []
    sad_location = []
    fear_location = []
    angry_location = []
    love_location = []

    def check_token(token):
        for i in class_tokens:
            if token in class_tokens[i]:
                return i
        return -1

    TEST_SENTENCES = []

    LOCATIONS = []

    for tweet in tweepy.Cursor(api.search,
                               q=keyword,
                               count=100,
                               lang='en',
                               include_entities=False,
                               tweet_mode='extended').items(max_items):

        location = tweet.user.location
        if not location:
            location = ""
        else:
            if "," in location:
                location = location[0:location.index(",")]

        location = location.strip()
        LOCATIONS.append(location)
        # print('Location :' , location)

        temp = tweet._json.get('full_text')

        if temp.startswith("RT"):
            try:
                temp = tweet._json.get('retweeted_status').get('full_text')
            except:
                temp = tweet._json.get('full_text')
        else:
            temp = tweet._json.get('full_text')

        temp = temp.replace("RT ", "").replace("!", "").replace(
            "..",
            "").replace("$", "").replace("%", "").replace("&", "").replace(
                "~",
                "").replace("-", "").replace("+", "").replace("#", "").replace(
                    "\\n", "").replace("\\", "").replace("|", "")

        temp = " ".join(filter(lambda x: x[0] != '@', temp.split()))
        temp = re.sub(r'https\S+', "", temp)
        temp = temp.strip()
        para = para + temp
        TEST_SENTENCES.append(temp)

    print('Locations :', LOCATIONS)
    r.extract_keywords_from_text(para)
    # r.get_ranked_phrases_with_scores()

    ranked_phrases = r.get_ranked_phrases()

    for i in range(0, len(ranked_phrases)):
        ranked_phrases[i] = ranked_phrases[i].replace(",", "").replace(
            "'", "").replace("(", "").replace(')',
                                              "").replace('.', "").replace(
                                                  '`', "").replace('!', "")

        ranked_phrases[i] = re.sub(' +', ' ', ranked_phrases[i]).strip()

    top_keywords = ranked_phrases[:]

    for i in range(0, len(ranked_phrases)):

        t1 = ranked_phrases[i].split()
        if len(t1) > 3:
            top_keywords.remove(ranked_phrases[i])

    # print(TEST_SENTENCES)

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    maxlen = 30
    batch_size = 32

    # print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    # print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    #model.summary()

    # print('Running predictions.')
    prob = model.predict(tokenized)

    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the DeepMoji repo.
    # print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_tokens = tokenized[i]
        t_score = [t]
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score.append(sum(t_prob[ind_top]))
        t_score.append(ind_top)
        t_score.append([t_prob[ind] for ind in ind_top])
        t_score.append('' + LOCATIONS[i])
        scores.append(t_score)
    # print(t_score)

    # print('Scores skjdvbkjsdbvjk : ' , scores[0])

    for i, row in enumerate(scores):
        try:
            # print(row[0])
            # print('row 2')
            # print(row[2][0])

            # if (row[2] in class_tokens]
            temp = check_token(row[2][0])
            # print(temp)

            if temp == 'sad':
                sad_counter = 1 + sad_counter
                sad_buffer.append(row[0])
                sad_para = sad_para + row[0]
                sad_location.append(row[4])

            elif temp == 'happy':
                happy_counter = 1 + happy_counter
                # print("happy counter");
                # print(happy_counter);
                happy_buffer.append(row[0])
                happy_para = happy_para + row[0]
                happy_location.append(row[4])

            elif temp == 'fear':
                fear_counter = 1 + fear_counter
                fear_buffer.append(row[0])
                fear_para = fear_para + row[0]
                fear_location.append(row[4])

            elif temp == 'angry':
                angry_counter = 1 + angry_counter
                angry_buffer.append(row[0])
                angry_para = angry_para + row[0]
                angry_location.append(row[4])

            elif temp == 'love':
                love_counter = 1 + love_counter
                love_buffer.append(row[0])
                love_para = love_para + row[0]
                love_location.append(row[4])

        except Exception:
            pass
        # print("Exception at row {}!".format(i))

    # print("Angry buffer : " , angry_buffer)
    # print("Sad buffer : " , sad_buffer)

    r.extract_keywords_from_text(happy_para)
    happy_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(sad_para)
    sad_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(fear_para)
    fear_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(angry_para)
    angry_phrases = r.get_ranked_phrases()[0:3]

    r.extract_keywords_from_text(love_para)
    love_phrases = r.get_ranked_phrases()[0:3]

    # print("Phrases " , happy_phrases)
    # print("Angry Locations : " , angry_location)

    return (happy_buffer, sad_buffer, fear_buffer, love_buffer, angry_buffer,
            happy_phrases, sad_phrases, fear_phrases, love_phrases,
            angry_phrases, happy_location, sad_location, fear_location,
            love_location, angry_location, top_keywords[:10])
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

TEST_SENTENCES = [
    u'I love mom\'s cooking', u'I love how you never reply back..',
    u'I love cruising with my homies', u'I love messing with yo mind!!',
    u'I love you and now you\'re just gone..', u'This is shit',
    u'This is the shit'
]

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

print('Encoding texts..')
encoding = model.predict(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
Example #39
print ("len(TEST_SENTENCES):",len(TEST_SENTENCES))
print ("TEST_SENTENCES[1]:",TEST_SENTENCES[1])


##################################################################
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)  # from list of tokens to numbers

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

print('Running predictions.')
prob = model.predict(tokenized)
# prob[] is the softmax output for the 64 emojis
# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png 
# at the root of the DeepMoji repo.
'''
print('Writing results to {}'.format(OUTPUT_PATH))