コード例 #1
0
ファイル: test_finetuning.py プロジェクト: Dobatymo/DeepMoji
def test_encode_texts():
    """Check that the feature encoding of a fixed set of sentences is stable.

    The sentences are tokenized with the vocabulary loaded from
    ``VOCAB_PATH``, passed through the model returned by
    ``deepmoji_feature_encoding`` and averaged across sentences. The first
    five dimensions of that average, rounded to three decimals, must match
    the hard-coded reference values.
    """

    TEST_SENTENCES = [u'I love mom\'s cooking',
                      u'I love how you never reply back..',
                      u'I love cruising with my homies',
                      u'I love messing with yo mind!!',
                      u'I love you and now you\'re just gone..',
                      u'This is shit',
                      u'This is the shit']

    maxlen = 30

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    # tokenize_sentences returns three values; only the first is needed here.
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)

    encoding = model.predict(tokenized)
    # Mean over axis 0 (one row per sentence), keep the first five entries.
    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    expected = np.array([-0.023, 0.021, -0.037, -0.001, -0.005])
    assert np.allclose(avg_across_sentences, expected)
コード例 #2
0
def test_encode_texts():
    """Verify that encoding a fixed set of sentences gives stable features.

    Loads the vocabulary from ``VOCAB_PATH``, tokenizes the sentences, runs
    them through the model built by ``deepmoji_feature_encoding`` and
    asserts that the first five dimensions of the encoding, averaged over
    the sentences and rounded to three decimals, equal the hard-coded
    reference values.
    """

    TEST_SENTENCES = [
        u'I love mom\'s cooking', u'I love how you never reply back..',
        u'I love cruising with my homies', u'I love messing with yo mind!!',
        u'I love you and now you\'re just gone..', u'This is shit',
        u'This is the shit'
    ]

    maxlen = 30

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    # Only the first of the three values returned by the tokenizer is used.
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)

    encoding = model.predict(tokenized)
    # Average over axis 0 (the sentence axis) before slicing five values.
    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
コード例 #3
0
def main():
    """Compute per-post DeepMoji and emoji features and write them to CSV.

    Reads ``../data/interim/sentences.csv`` (the ``body`` and ``post_id``
    columns are used), tokenizes every sentence, runs the tokens through
    the models returned by ``deepmoji_feature_encoding`` and
    ``deepmoji_emojis``, aggregates both outputs per ``post_id`` with
    mean/max/min, merges the two tables on their index and writes the
    result to ``../data/interim/all_sent_level_deepmoji.csv``.

    NOTE(review): ``unicode`` is a Python 2 builtin; unless it is defined
    elsewhere in the file this function only runs on Python 2.
    """
    df = pd.read_csv('../data/interim/sentences.csv')

    maxlen = 30

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    sentences = []
    for sent in df.body.tolist():
        sent = unicode(str(sent), "utf-8")
        # Whitespace-only sentences are replaced by a placeholder word.
        if sent.strip() == "":
            sent = u'blank'
        sentences.append(sent)

    # Only the first of the three values returned by the tokenizer is used.
    tokenized, _, _ = st.tokenize_sentences(sentences)

    # generate full deepmoji features for sentences
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Encoding texts with deepmoji features...')
    encoding = model.predict(tokenized)

    deepmoji_encodings = pd.DataFrame(encoding)
    # Index by post id so the sentence rows can be grouped per post below.
    deepmoji_encodings.index = df.post_id

    deepmoji_post_scores = deepmoji_encodings.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    # flatten_cols is defined outside this block; presumably it collapses
    # the two-level columns produced by agg -- confirm against its source.
    deepmoji_post_scores = flatten_cols(deepmoji_post_scores)
    deepmoji_post_scores = deepmoji_post_scores.add_prefix('deepmoji_')

    # generate 64 emoji encodings
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
    model.summary()

    print('Running emoji predictions...')
    prob = model.predict(tokenized)
    emoji_scores = pd.DataFrame(prob)
    emoji_scores = emoji_scores.add_prefix('emoji_')
    emoji_scores.index = df.post_id

    emoji_post_scores = emoji_scores.groupby('post_id').agg(
        ['mean', 'max', 'min'])
    emoji_post_scores = flatten_cols(emoji_post_scores)

    print('deepmoji features shape: {}'.format(deepmoji_post_scores.shape))
    print('emoji features shape: {}'.format(emoji_post_scores.shape))
    # Both tables are indexed by post_id, so merge on the index.
    total_feats = deepmoji_post_scores.merge(emoji_post_scores,
                                             left_index=True,
                                             right_index=True)
    print('total features shape: {}'.format(total_feats.shape))
    total_feats.to_csv('../data/interim/all_sent_level_deepmoji.csv')
コード例 #4
0
ファイル: deep_moji.py プロジェクト: ysenarath/opinion-lab
 def initialize(self):
     """Build the sentence tokenizer and the feature-encoding model.

     Both the weights file (``deepmoji_weights.hdf5``) and the vocabulary
     file (``vocabulary.json``) are looked up inside ``self.model_path``.
     The results are stored on ``self._st_`` and ``self._model_``.
     """
     model_dir = self.model_path
     weights_file = os.path.join(model_dir, 'deepmoji_weights.hdf5')
     vocab_file = os.path.join(model_dir, 'vocabulary.json')

     with open(vocab_file, 'r') as handle:
         token_index = json.load(handle)

     self._st_ = SentenceTokenizer(token_index, self.max_len)
     self._model_ = deepmoji_feature_encoding(
         self.max_len, weights_file, self.return_attention)
コード例 #5
0
def use_deepmoji(maxlen=MAXLEN,
                 vocab_path=DEEPMOJI_VOCAB_FILE,
                 weights_path=DEEPMOJI_WEIGHT_FILE):
    """Create a sentence tokenizer and a feature-encoding model.

    Args:
        maxlen: Value handed to both the tokenizer and the model builder.
        vocab_path: Path of the JSON vocabulary file to load.
        weights_path: Path handed to ``deepmoji_feature_encoding``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print('Tokenizing using dictionary from {}'.format(vocab_path))
    with open(vocab_path, 'r') as vocab_file:
        token_index = json.load(vocab_file)
    tokenizer = SentenceTokenizer(token_index, maxlen)

    print('Loading model from {}.'.format(weights_path))
    encoder = deepmoji_feature_encoding(maxlen, weights_path)
    # Print the model summary before handing the model back to the caller.
    encoder.summary()

    return tokenizer, encoder
コード例 #6
0
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

# Example script: tokenize a few fixed sentences, encode them with the model
# returned by deepmoji_feature_encoding and print part of the first encoding.

# Sentences that are tokenized and encoded below.
TEST_SENTENCES = [
    u'I love mom\'s cooking', u'I love how you never reply back..',
    u'I love cruising with my homies', u'I love messing with yo mind!!',
    u'I love you and now you\'re just gone..', u'This is shit',
    u'This is the shit'
]

# maxlen is passed to both the tokenizer and the model builder.
maxlen = 30
# NOTE(review): batch_size is not read anywhere in the visible script.
batch_size = 32

# Load the vocabulary from the JSON file at VOCAB_PATH.
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
# tokenize_sentences returns three values; only the first is used here.
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

# Build the model from PRETRAINED_PATH and print its summary.
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

# Run the model on the tokenized sentences.
print('Encoding texts..')
encoding = model.predict(tokenized)

# Show the first five values of the encoding of the first sentence.
print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
コード例 #7
0
ファイル: encode_texts.py プロジェクト: Dobatymo/DeepMoji
# Example script: tokenize a few fixed sentences, encode them with the model
# returned by deepmoji_feature_encoding and print part of the first encoding.

# Sentences that are tokenized and encoded below.
TEST_SENTENCES = [u'I love mom\'s cooking',
                  u'I love how you never reply back..',
                  u'I love cruising with my homies',
                  u'I love messing with yo mind!!',
                  u'I love you and now you\'re just gone..',
                  u'This is shit',
                  u'This is the shit']

# maxlen is passed to both the tokenizer and the model builder.
maxlen = 30
# NOTE(review): batch_size is not read anywhere in the visible script.
batch_size = 32

# Load the vocabulary from the JSON file at VOCAB_PATH.
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
# tokenize_sentences returns three values; only the first is used here.
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

# Build the model from PRETRAINED_PATH and print its summary.
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

# Run the model on the tokenized sentences.
print('Encoding texts..')
encoding = model.predict(tokenized)

# Show the first five values of the encoding of the first sentence.
print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.