Example no. 1
def test_torchmoji_return_attention():
    seq_tensor = np.array([[1]])
    # test the output of the normal model
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    # check correct number of outputs
    assert len(model(seq_tensor)) == 1
    # repeat above described tests when returning attention weights
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
    assert len(model(seq_tensor)) == 2
Example no. 2
def test_torchmoji_return_attention():
    seq_tensor = np.array([[1]])
    # test the output of the normal model
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    # check correct number of outputs
    assert len(model(seq_tensor)) == 1
    # repeat above described tests when returning attention weights
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH,
                             return_attention=True)
    assert len(model(seq_tensor)) == 2
Example no. 3
async def predict_sentence_emojis(sentence: str,
                                  num_to_predict: int = 5) -> dict:
    """
    Predict top n emojis based on the sentence
    :param sentence: sentence used in prediction
    :param num_to_predict: number of top emojis to return
    :return: Dictionary where key is predicted emoji and value is its probability
    """

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, MAXLEN)

    model = torchmoji_emojis(PRETRAINED_PATH)
    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model(tokenized)[0]

    ind_top = top_elements(prob, num_to_predict)
    emojis = list(map(lambda x: EMOJIS[x], ind_top))

    # Might be useful if we need to send it this way
    # emojis_unicode_escape = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj].encode('unicode-escape') for emoj in emojis]

    emojis_unicode = [
        unicode_codes.EMOJI_ALIAS_UNICODE[emoj] for emoj in emojis
    ]
    return dict(zip(emojis_unicode, prob[ind_top]))
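The function above is a coroutine, so it has to be awaited. A minimal usage sketch, assuming the module-level names the snippet relies on (VOCAB_PATH, PRETRAINED_PATH, MAXLEN, EMOJIS, top_elements, unicode_codes) are available; the sentence is illustrative only:

import asyncio

# Hypothetical driver: run the coroutine and print each predicted emoji with its probability.
predictions = asyncio.run(predict_sentence_emojis("I love mom's cooking", num_to_predict=3))
for emoji_char, probability in predictions.items():
    print(f"{emoji_char}: {probability:.3f}")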
Example no. 4
def init_tokenizer_emotions(max_len):
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, max_len)
    model = torchmoji_emojis(PRETRAINED_PATH)

    return st, model
Example no. 5
def get_emotion_features_from_text(text, audio_filename):
    # https://github.com/huggingface/torchMoji/blob/master/examples/score_texts_emojis.py

    if text == '':
        emoji_ids = []
        one_hot_encodings = []
    else:
        text = [text]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        maxlen = 30

        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        st = SentenceTokenizer(vocabulary, maxlen)

        model = torchmoji_emojis(PRETRAINED_PATH)
        tokenized, _, _ = st.tokenize_sentences(text)
        prob = model(tokenized)

        for prob in [prob]:
            # Find top emojis for each sentence. Emoji ids (0-63)
            # correspond to the mapping in emoji_overview.png
            # at the root of the torchMoji repo.
            scores = []
            for i, t in enumerate(text):
                t_tokens = tokenized[i]
                t_score = [t]
                t_prob = prob[i]
                ind_top = top_elements(t_prob, 5)
                t_score.append(sum(t_prob[ind_top]))
                t_score.extend(ind_top)
                t_score.extend([t_prob[ind] for ind in ind_top])
                scores.append(t_score)

        emoji_ids = scores[0][2:2 + 5]
        one_hot_encodings = []
        for emoji_idx in emoji_ids:
            one_hot_encodings.append(
                [0 if i != emoji_idx else 1 for i in range(64)])

    a = audio_filename.split('/')

    filename = '/' + '/'.join(
        a[1:-1]) + '/onehot_emotion_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)

    filename = '/' + '/'.join(
        a[1:-1]) + '/emoji_ids_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)

    return emoji_ids, one_hot_encodings
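A hypothetical call to the function above, to make its side effects concrete; the text and path are made up. With audio_filename='/data/audio/clip01.wav', the pickles are written next to the audio file as /data/audio/onehot_emotion_clip01.pkl and /data/audio/emoji_ids_clip01.pkl:

# Illustrative only: the text and path are placeholders.
emoji_ids, one_hot = get_emotion_features_from_text(
    "I love cruising with my homies", "/data/audio/clip01.wav")
print(emoji_ids)        # five emoji ids (0-63), highest probability first
print(len(one_hot[0]))  # each one-hot vector has 64 slots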
Example no. 6
 def __init__(self):
     # Tokenizing using dictionary
     with open(VOCAB_PATH, 'r') as f:
         vocabulary = json.load(f)
     self.st = SentenceTokenizer(vocabulary, 30)
     # Loading model
     self.model = torchmoji_emojis(PRETRAINED_PATH)
     # Running predictions
     self.dangoURL = "https://emoji.getdango.com/api/emoji?q="
Example no. 7
    def __init__(self, *args, **kwargs):
        HTTPServer.__init__(self, *args, **kwargs)
        with open(vocab_file_path, 'r') as f:
            vocabulary = json.load(f)

        max_sentence_length = 100

        self.st = SentenceTokenizer(vocabulary, max_sentence_length)
        self.model = torchmoji_emojis(model_weights_path)
Example no. 8
 def __init__(self, use_cuda=True):
     super(MojiModel, self).__init__()
     self.use_cuda = use_cuda
     self.EMOJIS = EMOJIS
     self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
     with open(VOCAB_PATH, 'r') as f:
         vocabulary = json.load(f)
     self.tokenizer = SentenceTokenizer(vocabulary, 100)
     print(self.emoji_model)
     self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
     if use_cuda:
         self.emoji_model = self.emoji_model.cuda()
         self.feat_model = self.feat_model.cuda()
Example no. 9
 def __init__(self, vocab: Vocabulary) -> None:
     super().__init__(vocab)
     self.accuracy = MicroMetrics(vocab)
     self.label_index_to_label = self.vocab.get_index_to_token_vocabulary(
         'labels')
     final_concatenated_dimension = 64 * 3
     self.input_layer = torch.nn.Linear(
         in_features=final_concatenated_dimension, out_features=64)
     self.output_layer = torch.nn.Linear(
         in_features=64, out_features=vocab.get_vocab_size("labels"))
     self.sigmoid = nn.Sigmoid()
     with open(VOCAB_PATH, 'r') as f:
         self.vocabulary = json.load(f)
         self.st = SentenceTokenizer(self.vocabulary, 20)
     self.model = torchmoji_emojis(PRETRAINED_PATH)
Example no. 10
def init():
    global sentence_tokenizer
    global model
    global emoji_desc, emoji_unicode

    max_token = 30
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    
    sentence_tokenizer = SentenceTokenizer(vocabulary, max_token)
    model = torchmoji_emojis(PRETRAINED_PATH)

    with open('data/emoji_codes.json') as f:
        emoji_desc = json.load(f)

    with open('data/wanted_emojis.csv') as f:
        emoji_unicode = list(csv.reader(f))
Example no. 11
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36,  4,  8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31,  6, 30, 15, 13]),
        np.array([54, 44,  9, 50, 49]),
        np.array([46,  5, 27, 35, 34]),
        np.array([55, 32, 27,  1, 37]),
        np.array([48, 11,  6, 31,  9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
Example no. 12
    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)

        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")

        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)

        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True
            )
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)

        self.log.debug("Model loaded")
Example no. 13
def test():
    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    if __name__ == "__main__":
        argparser = argparse.ArgumentParser()
        argparser.add_argument('--text',
                               type=str,
                               required=True,
                               help="Input text to emojize")
        argparser.add_argument('--maxlen',
                               type=int,
                               default=30,
                               help="Max length of input text")
        args = argparser.parse_args()

        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        st = SentenceTokenizer(vocabulary, args.maxlen)

        # Loading model
        model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        tokenized, _, _ = st.tokenize_sentences([args.text])
        # Get sentence probability
        prob = model(tokenized)[0]

        # Top emoji id
        emoji_ids = top_elements(prob, 5)

        # map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)

        print(
            emoji.emojize("{} {}".format(args.text, ' '.join(emojis)),
                          use_aliases=True))
Example no. 14
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking', 'I love how you never reply back..',
        'I love cruising with my homies', 'I love messing with yo mind!!',
        'I love you and now you\'re just gone..', 'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
def text_to_emoji(text, maxlen):
    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)
    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([text])
    # Get sentence probability
    prob = model(tokenized)[0]

    # Top emoji id
    emoji_ids = top_elements(prob, 5)

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(
        emoji.emojize("{} {}".format(text, ' '.join(emojis)),
                      use_aliases=True))
:sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
:wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
:angry: :no_good: :muscle: :facepunch: :purple_heart: \
:sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')

# Specify the paths to the vocabulary and model weights files.
vocab_file_path = '/model/vocabulary.json'
model_weights_path = '/model/pytorch_model.bin'

with open(vocab_file_path, 'r') as f:
    vocabulary = json.load(f)

max_sentence_length = 100

st = SentenceTokenizer(vocabulary, max_sentence_length)
model = torchmoji_emojis(model_weights_path)


def predict(text):
    if not isinstance(text, list):
        text = [text]
    tokenized, _, _ = st.tokenize_sentences(text)
    prob = model(tokenized)[0]
    # Only keep the emoji with the highest confidence.
    emoji_ids = top_elements(prob, 1)
    emojis = list(map(lambda x: EMOJIS[x].strip(':'), emoji_ids))
    return emojis[0]


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]
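A minimal usage sketch for the predict helper above; the sentence is illustrative only:

# Prints the alias of the single highest-confidence emoji, with the colons stripped.
print(predict("I love cruising with my homies"))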
Example no. 17
import json
import numpy as np
import emoji

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

# EMOJIS (the 64-emoji alias list) is assumed to be defined elsewhere, as in the other examples.


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 300)
model = torchmoji_emojis(PRETRAINED_PATH)


def emojify_sentences(l):
    tokenized, _, _ = st.tokenize_sentences(l)
    prob = model(tokenized)

    result = []
    for i in range(len(l)):
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        # Pair each of the top 5 emojis with its probability for this sentence
        result.append([
            [emoji.emojize(EMOJIS[ind], use_aliases=True),
             float(t_prob[ind])]
            for ind in ind_top
        ])
    return result
Example no. 18
df_sample.sentiment = pd.to_numeric(df_sample.sentiment)
#checks that the sample mean is reasonable
np.mean(df_sample.sentiment)


#import tweets and replace text with full text if 'tweet' is a retweet
df = pd.read_json('immigrationTweets.json')
df.loc[~df.retweeted_status.isnull(), 'text'] = df[~df.retweeted_status.isnull()].retweeted_status.apply(lambda x: x.get('text'))
df = df[['id','user','text','lang','reply_count','retweet_count','retweeted_status','term']]


#import and parse emoji codes

#import vocab and model, define sentence tokenizer, set chunk_size
with open('/Users/ikennedy/Documents/GitHub/torchMoji/model/vocabulary.json') as f: vocab = json.load(f)
model = torchmoji_emojis('pytorch_model.bin')
st = SentenceTokenizer(vocab, 30)

#specify columns for the full df for:
#twitter pull
df_full = pd.DataFrame(columns=['id','user','text','lang','reply_count','retweet_count','retweeted_status','term']+list(emoji_codes))
#Twitter sample
df_full = pd.DataFrame(columns=['sentiment', 'text']+list(emoji_codes))
#run in loops of 5000 to avoid overusing computational resources
chunk_size = 5000
i = 1000
chunk_size = 1000
for i in range(chunk_size,len(df)+chunk_size,chunk_size):
    if(i>len(df)):
        i = len(df)
        chunk_size = len(df) % chunk_size
Example no. 19
    'disapprove_estimate', 'disapprove_hi', 'disapprove_lo', 'formatted_date',
    'status_id', 'sum(numScore)', 'text', 'created_at', 'name'
]]

#import and parse emoji codes
emoji_codes = pd.read_json(
    '/Users/ikennedy/Work/UW/Code/GIT/cl_lda/twitter/emojicodes.json',
    orient='values',
    typ='series').str.extract(r':(\w+):', expand=False).sort_index()

#import vocab and model, define sentence tokenizer, set chunk_size
os.getcwd()
with open('/Users/ikennedy/Documents/GitHub/torchMoji/model/vocabulary.json'
          ) as f:
    vocab = json.load(f)
model = torchmoji_emojis('twitter/pytorch_model.bin')
st = SentenceTokenizer(vocab, 30)

#specify columns for the full df for:
#twitter pull
df_full = pd.DataFrame(
    columns=['sentiment', 'id', 'date', 'query', 'screen_name', 'text'] +
    list(emoji_codes))
#Twitter sample
#df_full = pd.DataFrame(columns=['sentiment', 'text']+list(emoji_codes))
#run in loops of 5000 to avoid overusing computational resources
chunk_size = 5000
i = 1000
chunk_size = 1000
for i in range(chunk_size, len(df) + chunk_size, chunk_size):
    if (i > len(df)):
Example no. 20
def get_model():
    pretrained_path = download_pretrained()
    return torchmoji_emojis(pretrained_path)
    def __init__(self,
                 classifier_dims,
                 num_classes,
                 embedding_dims,
                 gaussian_noise,
                 dropout,
                 internal_dims,
                 n_layers,
                 featurizer,
                 final_layer_builder,
                 n_tokens_in=64,
                 n_tokens_out=16,
                 capabilities2dims=dict(),
                 use_as_super=False,
                 **kwargs):
        super(LangFeaturesModel, self).__init__(classifier_dims,
                                                num_classes,
                                                embedding_dims,
                                                gaussian_noise,
                                                dropout,
                                                internal_dims,
                                                n_layers,
                                                featurizer,
                                                final_layer_builder,
                                                n_tokens_in,
                                                n_tokens_out,
                                                use_as_super=True,
                                                **kwargs)
        assert "capabilities" in kwargs
        capabilities = kwargs["capabilities"]
        kwargs[
            "rake_dims"] = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
        kwargs[
            "yake_dims"] = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
        assert "key_phrases" not in capabilities or (
            "key_phrases" in capabilities and "spacy" in capabilities)
        use_layer_norm = kwargs[
            "use_layer_norm"] if "use_layer_norm" in kwargs else False
        self.capabilities = capabilities
        embedding_dim = 8
        cap_to_dim_map = {
            "spacy": 128,
            "snlp": 32,
            "key_phrases": 64,
            "nltk": 192,
            "full_view": 64,
            "tmoji": 32,
            "ibm_max": 16,
            "gensim": 256,
            "fasttext_crawl": 256
        }
        cap_to_dim_map.update(capabilities2dims)
        all_dims = sum([cap_to_dim_map[c] for c in capabilities])
        self.cap_to_dim_map = cap_to_dim_map
        self.all_dims = all_dims

        if "spacy" in capabilities:
            tr = pytextrank.TextRank(token_lookback=7)
            self.nlp = spacy.load("en_core_web_lg", disable=[])
            self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
            spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
            self.spacy_nn = ExpandContract(spacy_in_dims,
                                           cap_to_dim_map["spacy"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(2, 4))

        if "fasttext_crawl" in capabilities:
            self.bpe = BPEmb(dim=200)
            self.cngram = CharNGram()
            fasttext_crawl_file = kwargs[
                "fasttext_crawl_file"] if "fasttext_crawl_file" in kwargs else "crawl-300d-2M-subword.bin"
            self.crawl = fasttext.load_model(fasttext_crawl_file)
            self.crawl_nn = ExpandContract(200 + 300 + 100,
                                           cap_to_dim_map["fasttext_crawl"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(4, 4))

        if "gensim" in capabilities:
            gensim = [
                api.load("glove-twitter-50"),
                api.load("glove-wiki-gigaword-50"),
                api.load("word2vec-google-news-300"),
                api.load("conceptnet-numberbatch-17-06-300")
            ]
            self.gensim = gensim
            self.gensim_nn = ExpandContract(400,
                                            cap_to_dim_map["gensim"],
                                            dropout,
                                            use_layer_norm=use_layer_norm,
                                            groups=(4, 4))

        if "full_view" in capabilities:
            full_sent_in_dims = 300
            self.full_sent_nn = ExpandContract(full_sent_in_dims,
                                               cap_to_dim_map["full_view"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        if "snlp" in capabilities:
            import stanza
            self.snlp = stanza.Pipeline(
                'en',
                processors='tokenize,pos,lemma,depparse,ner',
                use_gpu=False,
                pos_batch_size=2048)
            self.snlp_nn = ExpandContract(embedding_dim * 5,
                                          cap_to_dim_map["snlp"],
                                          dropout,
                                          use_layer_norm=use_layer_norm)
        if "key_phrases" in capabilities:
            import yake
            self.kw_extractor = yake.KeywordExtractor(lan="en",
                                                      n=3,
                                                      dedupLim=0.9,
                                                      dedupFunc='seqm',
                                                      windowsSize=3,
                                                      top=10,
                                                      features=None)

            self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
            nn.init.normal_(self.key_occ_cnt_pytextrank.weight,
                            std=1 / embedding_dim)
            self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_pytextrank.weight,
                            std=1 / embedding_dim)

            yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
            self.yake_dims = yake_dims
            self.yake_nn = ExpandContract(300,
                                          yake_dims,
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 2))

            try:
                from multi_rake import Rake
                rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
                self.rake_dims = rake_dims
                self.rake_nn = ExpandContract(300,
                                              rake_dims,
                                              dropout,
                                              use_layer_norm=use_layer_norm,
                                              groups=(2, 2))
                self.rake = Rake(language_code="en")
                keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
            except:
                self.rake = None
                keyphrases_dim = 2 * embedding_dim + yake_dims
            self.keyphrase_nn = ExpandContract(keyphrases_dim,
                                               cap_to_dim_map["key_phrases"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        fasttext_file = kwargs[
            "fasttext_file"] if "fasttext_file" in kwargs else "wiki-news-300d-1M-subword.bin"
        if not set(capabilities).isdisjoint(
            {"key_phrases", "full_view", "nltk"}):
            self.text_model = fasttext.load_model(fasttext_file)

        self.pdict = get_all_tags()
        self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
        nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)

        self.sw_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)

        self.sent_start_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)

        self.is_oov_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)

        self.has_digit_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)

        self.is_mask_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)

        self.w_len = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)

        self.wc_emb = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

        if "nltk" in capabilities:
            import rake_nltk
            from textblob import TextBlob
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
            self.stop_words = set(stopwords.words('english'))
            self.rake_nltk = rake_nltk.Rake()
            self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_rake_nltk.weight,
                            std=1 / embedding_dim)
            self.nltk_sid = SentimentIntensityAnalyzer()
            self.vader_sid = VaderSentimentIntensityAnalyzer()
            in_dims = 310 + 5 * embedding_dim
            self.nltk_nn = ExpandContract(in_dims,
                                          cap_to_dim_map["nltk"],
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 4))

        if "ibm_max" in capabilities:
            from ..external import ModelWrapper
            self.ibm_max = ModelWrapper()
            for p in self.ibm_max.model.parameters():
                p.requires_grad = False
            self.ibm_nn = ExpandContract(6,
                                         cap_to_dim_map["ibm_max"],
                                         dropout,
                                         use_layer_norm=use_layer_norm,
                                         groups=(1, 1))

        if "tmoji" in capabilities:
            from torchmoji.sentence_tokenizer import SentenceTokenizer
            from torchmoji.model_def import torchmoji_emojis
            from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
            with open(VOCAB_PATH, 'r') as f:
                maxlen = self.n_tokens_in
                self.vocabulary = json.load(f)
                self.st = SentenceTokenizer(self.vocabulary, maxlen)
                self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
                for p in self.tmoji.parameters():
                    p.requires_grad = False
            self.tm_nn = ExpandContract(64,
                                        cap_to_dim_map["tmoji"],
                                        dropout,
                                        use_layer_norm=use_layer_norm,
                                        groups=(1, 1))

        self.contract_nn = ExpandContract(self.all_dims,
                                          embedding_dims,
                                          dropout,
                                          use_layer_norm=True,
                                          unit_norm=False,
                                          groups=(4, 4))
        if not use_as_super:
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims,
                                                n_tokens_out, classifier_dims,
                                                internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)

            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs[
                    "attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(
                    n_tokens_in, embedding_dims, n_tokens_out, classifier_dims,
                    internal_dims, n_encoders, n_decoders, gaussian_noise,
                    dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims,
                                                   n_tokens_out, num_classes,
                                                   dropout, **kwargs)
        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = get_regularization_layers(self)
Example no. 22
    def __init__(self):
        self.maxlen = 30
        # SentenceTokenizer needs the torchMoji vocabulary and a fixed max length
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.sent_tokenizer = SentenceTokenizer(vocabulary, self.maxlen)

        # Model weights
        self.model = torchmoji_emojis(PRETRAINED_PATH)
Example no. 23
def text_to_emoji(input_text, max_length):
    #argparser = argparse.ArgumentParser()
    #argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    #argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    #args = argparser.parse_args()

    # Load dictionary for tokenizing
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    #print(f'vocabulary: {vocabulary}')

    with open(os.path.join(os.path.dirname(__file__),
                           './negative_words_parsed.txt'),
              'r',
              encoding='utf-8',
              errors='ignore') as negative_words_list:
        negative_words = list(negative_words_list)
        negative_words = [
            negative_word.rstrip('\n').lower()
            for negative_word in negative_words if negative_word != '\n'
        ]

    with open(
            os.path.join(os.path.dirname(__file__),
                         './positive_words_parsed.txt'),
            'r') as positive_words_list:
        positive_words = list(positive_words_list)
        positive_words = [
            positive_word.rstrip('\n').lower()
            for positive_word in positive_words if positive_word != '\n'
        ]

    st = SentenceTokenizer(vocabulary, max_length)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    # Determines the important words in the sentence
    tokenized, _, _ = st.tokenize_sentences([input_text])
    # Get sentence probability
    prob = model(tokenized)[0]

    # Top emotion id
    emotion_ids = top_elements(prob, 5)
    #print(f'top five emotion ids: {emotion_ids}')

    # map to emotions
    emotions = map(lambda x: EMOTIONS[x], emotion_ids)
    emotions = list(emotions)
    #print(f'emotions: {emotions}')
    user_feelings = positive_or_negative(emotions)
    #print(f'user_feelings: {user_feelings}')

    # Find the words that are contributing to the feeling
    user_positive_words = []
    user_negative_words = []

    for word in input_text.split(' '):
        if word in positive_words:
            user_positive_words.append(word)
        elif word in negative_words:
            user_negative_words.append(word)

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emotion_ids)
    #print(f'emojis: {list(emojis)}')
    main_vibe = list(emojis)[0]
    #print(f'main_vibe: {main_vibe}')

    json_to_bot = {
        "user_emotion": user_feelings,
        "positive": user_positive_words,
        "negative": user_negative_words,
        "main_vibe": main_vibe
    }
    return json.dumps(json_to_bot)
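A hypothetical caller for the function above; the text and max length are illustrative only:

# The function returns a JSON string with user_emotion, positive, negative and main_vibe keys.
reply = json.loads(text_to_emoji("I love messing with yo mind!!", 30))
print(reply["main_vibe"], reply["user_emotion"])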