import json

import emoji
import torch
import torch.nn as nn

from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
from torchmoji.sentence_tokenizer import SentenceTokenizer

# EMOJIS is assumed to be the usual 64-entry list of emoji aliases (e.g. ':joy:')
# matching emoji_overview.png in the torchMoji repo.


class MojiModel(nn.Module):
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()

    def predict(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        prob = self.emoji_model(tokenized)[0]
        return prob

    def moji_feat(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        return self.feat_model(tokenized)[0]

    def to_emoji(self, idx):
        return emoji.emojize(self.EMOJIS[idx], use_aliases=True)
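A minimal usage sketch for the wrapper above (an addition, not part of the original file): it assumes the torchMoji weights and vocabulary referenced by PRETRAINED_PATH / VOCAB_PATH are already downloaded and that EMOJIS is the 64-entry alias list; passing use_cuda=False keeps everything on CPU.

# Hypothetical usage of MojiModel (CPU only); the sentence is illustrative.
model = MojiModel(use_cuda=False)
probs = model.predict("I love mom's cooking")       # 64 emoji probabilities
top_idx = int(probs.argmax())                       # index of the most likely emoji
print(model.to_emoji(top_idx), float(probs[top_idx]))
features = model.moji_feat("I love mom's cooking")  # torchMoji feature encoding (2304-dim in the released model)
print(features.shape)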
def test_encode_texts():
    """ Text encoding is stable. """
    TEST_SENTENCES = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)

    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
async def predict_sentence_emojis(sentence: str, num_to_predict: int = 5) -> dict:
    """Predict the top n emojis for a sentence.

    :param sentence: sentence used in prediction
    :param num_to_predict: number of top emojis to return
    :return: dictionary where each key is a predicted emoji and the value is its probability
    """
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, MAXLEN)
    model = torchmoji_emojis(PRETRAINED_PATH)

    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model(tokenized)[0]
    ind_top = top_elements(prob, num_to_predict)
    emojis = list(map(lambda x: EMOJIS[x], ind_top))

    # Might be useful if we need to send it this way
    # emojis_unicode_escape = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj].encode('unicode-escape')
    #                          for emoj in emojis]
    emojis_unicode = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj] for emoj in emojis]
    return dict(zip(emojis_unicode, prob[ind_top]))
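predict_sentence_emojis relies on a top_elements helper that is not shown with it; the same k-best-index helper appears verbatim in several other snippets in this collection, so a copy is repeated here for completeness (MAXLEN, EMOJIS, and unicode_codes are likewise module-level names assumed to be defined alongside it).

import numpy as np

def top_elements(array, k):
    # indices of the k largest values, ordered from highest to lowest probability
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]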
class EmotionBiLSTM(Model):
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.accuracy = MicroMetrics(vocab)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
        final_concatenated_dimension = 64 * 3
        self.input_layer = torch.nn.Linear(in_features=final_concatenated_dimension, out_features=64)
        self.output_layer = torch.nn.Linear(in_features=64, out_features=vocab.get_vocab_size("labels"))
        self.sigmoid = nn.Sigmoid()
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
        self.st = SentenceTokenizer(self.vocabulary, 20)
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def tokenize(self, sentences):
        tokenized, _, _ = self.st.tokenize_sentences(sentences)
        return torch.from_numpy(tokenized.astype(np.int64))

    def forward(self,
                turn1,
                turn2,
                turn3,
                conversation_id: str,
                turns: str,
                labels: torch.Tensor = None):
        # TODO: look up reverse embedding of padded sequences
        turn1 = [x['turn1'] for x in turn1]
        turn2 = [x['turn2'] for x in turn2]
        turn3 = [x['turn3'] for x in turn3]
        predictions1 = self.model(self.tokenize(turn1))
        predictions2 = self.model(self.tokenize(turn2))
        predictions3 = self.model(self.tokenize(turn3))
        predictions = torch.cat([predictions1, predictions2, predictions3], dim=1)
        input2hidden = self.input_layer(predictions)
        label_logits = self.sigmoid(self.output_layer(input2hidden))
        # self.matrix_attention = self.matrix_attention(encoded_turn1and2, encoded_turn3)
        label_logits = F.softmax(label_logits, dim=1)
        output = {
            "prediction": [self.label_index_to_label[x]
                           for x in label_logits.argmax(dim=1).numpy()],
            "ids": [x["ids"] for x in conversation_id],
            "turns": [x["turns"] for x in turns]
        }
        if labels is not None:
            # TODO: check loss with and without mask
            self.accuracy(label_logits, labels)
            output["loss"] = cross_entropy_loss(label_logits, labels)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
def get_emotion_features_from_text(text, audio_filename):
    # https://github.com/huggingface/torchMoji/blob/master/examples/score_texts_emojis.py
    if text == '':
        emoji_ids = []
        one_hot_encodings = []
    else:
        text = [text]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        maxlen = 30

        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        st = SentenceTokenizer(vocabulary, maxlen)
        model = torchmoji_emojis(PRETRAINED_PATH)

        tokenized, _, _ = st.tokenize_sentences(text)
        prob = model(tokenized)

        # Find top emojis for each sentence. Emoji ids (0-63)
        # correspond to the mapping in emoji_overview.png
        # at the root of the torchMoji repo.
        scores = []
        for i, t in enumerate(text):
            t_tokens = tokenized[i]
            t_score = [t]
            t_prob = prob[i]
            ind_top = top_elements(t_prob, 5)
            t_score.append(sum(t_prob[ind_top]))
            t_score.extend(ind_top)
            t_score.extend([t_prob[ind] for ind in ind_top])
            scores.append(t_score)

        emoji_ids = scores[0][2:2 + 5]
        one_hot_encodings = []
        for emoji_idx in emoji_ids:
            one_hot_encodings.append([0 if i != emoji_idx else 1 for i in range(64)])

    a = audio_filename.split('/')
    filename = '/' + '/'.join(a[1:-1]) + '/onehot_emotion_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)
    filename = '/' + '/'.join(a[1:-1]) + '/emoji_ids_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)
    return emoji_ids, one_hot_encodings
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input. """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}
    sentence = 'aasdf basdf basdf basdf'

    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns. """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}
    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'

    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
class EmojiPredictor(object):
    def __init__(self):
        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.st = SentenceTokenizer(vocabulary, 30)
        # Loading model
        self.model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        self.dangoURL = "https://emoji.getdango.com/api/emoji?q="

    def getPredictedEmojis(self, text):
        api_response = ''
        try:
            # It turned out that Dango has stopped the API service,
            # so we may just use the DeepMoji model instead.
            r = requests.get("https://emoji.getdango.com/api/emoji", params={"q": text})
            api_response = json.loads(r.text)
        except Exception:
            pass
        if 'results' in api_response:
            res = [item['text'] for item in api_response['results']]
            if len(res) < 5:
                extraemojis = self.localPredict(text)
                for k in extraemojis:
                    if k not in res:
                        res.append(k)
                    if len(res) == 5:
                        return res
            else:
                return res[:5]
        else:
            return self.localPredict(text)

    def localPredict(self, text):
        tokenized, _, _ = self.st.tokenize_sentences([text.lower()])
        # Get sentence probability
        prob = self.model(tokenized)[0]
        # Top emoji ids
        emoji_ids = top_elements(prob, 6)
        # Drop emoji id 42 while keeping the ranking order
        emoji_ids = [i for i in emoji_ids if i != 42]
        if len(emoji_ids) > 5:
            emoji_ids = emoji_ids[:5]
        # map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True).split()
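A brief, hypothetical usage sketch for EmojiPredictor: since the Dango API has been discontinued, getPredictedEmojis effectively falls back to localPredict, which needs only the local torchMoji weights and vocabulary.

# Hypothetical usage; the example sentence is illustrative only.
predictor = EmojiPredictor()
print(predictor.getPredictedEmojis("this is the best day ever"))
# -> a list of up to five rendered emoji characters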
def test_score_emoji():
    """ Emoji predictions make sense. """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenizing the texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(args.text, ' '.join(emojis)), use_aliases=True))
def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]
    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
def text_to_emoji(text, maxlen):
    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(text, ' '.join(emojis)), use_aliases=True))
class Emoji(runner.Runner):
    name = "emoji"

    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)
        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")
        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)
        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True)
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)
        self.log.debug("Model loaded")

    async def func(self, request, **kwargs):
        text_batch = [self.normalize_input(request)]
        text_batch = [text.replace('"', "") for text in text_batch]  # remove quotes from text
        tokenized, _, _ = self.tm_sentence_tokenizer.tokenize_sentences(text_batch)
        prob = self.tm_model(tokenized)[0]
        emoji_ids = top_elements(prob, 3)
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        emoji_score = [emoji.emojize(e, use_aliases=True) for e in emojis]
        return emoji_score
from __future__ import print_function, division, unicode_literals

import json
import sys

import numpy as np

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

print("start the file")
message = sys.argv[1] + ""


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

tokenized, _, _ = st.tokenize_sentences([message])
prob = model(tokenized)
ind_top = top_elements(prob[0], 5)
print(ind_top)
# Empty results frame: Sentiment140-style columns plus one column per emoji.
df_full = pd.DataFrame(
    columns=['sentiment', 'id', 'date', 'query', 'screen_name', 'text'] + list(emoji_codes))
# Twitter sample
# df_full = pd.DataFrame(columns=['sentiment', 'text'] + list(emoji_codes))

# Run in chunks to avoid overusing computational resources.
chunk_size = 1000
for i in range(chunk_size, len(df) + chunk_size, chunk_size):
    if i > len(df):
        i = len(df)
        chunk_size = len(df) % chunk_size
    # grab the subset of documents
    documents = list(df.text[i - chunk_size:i])
    # tokenize them
    tokens, infos, stats = st.tokenize_sentences(documents)
    # fit the probabilities
    prob = model(tokens)
    # append the results to df_full
    df_full = df_full.append(
        df[i - chunk_size:i].join(
            pd.DataFrame(prob, columns=emoji_codes, index=range(i - chunk_size, i))))
    # print the status
    print("finished processing", i, "documents")

# confirm shape
df_full.shape

# save as csv
filename = '/Users/ikennedy/Work/UW/Class/SICSS/SICSS R/emoji_tweet_sample.csv'
df_full.to_csv(filename)
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

# Use torchMoji to encode texts into emotional feature vectors.
data = {'texts': [], 'batch_size': batch_size, 'labels': [], 'maxlen': maxlen, 'added': 0}
for i, df in enumerate(tables):
    print(tables_meaning[i])
    with open("data/{}_eng.json".format(tables_meaning[i]), "r") as f:
        translated = json.load(f)
    tokenized, _, _ = st.tokenize_sentences(translated)
    data['texts'].append(tokenized)

# add labels
feelings = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
            'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
for i, df in enumerate(tables):
    tmp_data = data['texts'][i]
    new_data = []
    labels = []
    for ri in range(len(df)):
        # labels.append([df.iloc[ri][f] == '1' for fi, f in enumerate(feelings)])
        not_find = True
        for fi, f in enumerate(feelings):
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

print('Running predictions.')
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
prob = model(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the torchMoji repo.
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
    t_tokens = tokenized[i]
    t_score = [t]
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 5)
    t_score.append(sum(t_prob[ind_top]))
    t_score.extend(ind_top)
class LangFeaturesModel(Fasttext1DCNNModel):
    def __init__(self, classifier_dims, num_classes, embedding_dims, gaussian_noise, dropout,
                 internal_dims, n_layers, featurizer, final_layer_builder,
                 n_tokens_in=64, n_tokens_out=16, capabilities2dims=dict(), use_as_super=False, **kwargs):
        super(LangFeaturesModel, self).__init__(classifier_dims, num_classes, embedding_dims,
                                                gaussian_noise, dropout, internal_dims, n_layers,
                                                featurizer, final_layer_builder,
                                                n_tokens_in, n_tokens_out, use_as_super=True, **kwargs)
        assert "capabilities" in kwargs
        capabilities = kwargs["capabilities"]
        kwargs["rake_dims"] = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
        kwargs["yake_dims"] = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
        assert "key_phrases" not in capabilities or ("key_phrases" in capabilities and "spacy" in capabilities)
        use_layer_norm = kwargs["use_layer_norm"] if "use_layer_norm" in kwargs else False
        self.capabilities = capabilities
        embedding_dim = 8
        cap_to_dim_map = {"spacy": 128, "snlp": 32, "key_phrases": 64, "nltk": 192,
                          "full_view": 64, "tmoji": 32, "ibm_max": 16,
                          "gensim": 256, "fasttext_crawl": 256}
        cap_to_dim_map.update(capabilities2dims)
        all_dims = sum([cap_to_dim_map[c] for c in capabilities])
        self.cap_to_dim_map = cap_to_dim_map
        self.all_dims = all_dims

        if "spacy" in capabilities:
            tr = pytextrank.TextRank(token_lookback=7)
            self.nlp = spacy.load("en_core_web_lg", disable=[])
            self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
            spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
            self.spacy_nn = ExpandContract(spacy_in_dims, cap_to_dim_map["spacy"], dropout,
                                           use_layer_norm=use_layer_norm, groups=(2, 4))

        if "fasttext_crawl" in capabilities:
            self.bpe = BPEmb(dim=200)
            self.cngram = CharNGram()
            fasttext_crawl_file = kwargs["fasttext_crawl_file"] if "fasttext_crawl_file" in kwargs else "crawl-300d-2M-subword.bin"
            self.crawl = fasttext.load_model(fasttext_crawl_file)
            self.crawl_nn = ExpandContract(200 + 300 + 100, cap_to_dim_map["fasttext_crawl"], dropout,
                                           use_layer_norm=use_layer_norm, groups=(4, 4))

        if "gensim" in capabilities:
            gensim = [api.load("glove-twitter-50"),
                      api.load("glove-wiki-gigaword-50"),
                      api.load("word2vec-google-news-300"),
                      api.load("conceptnet-numberbatch-17-06-300")]
            self.gensim = gensim
            self.gensim_nn = ExpandContract(400, cap_to_dim_map["gensim"], dropout,
                                            use_layer_norm=use_layer_norm, groups=(4, 4))

        if "full_view" in capabilities:
            full_sent_in_dims = 300
            self.full_sent_nn = ExpandContract(full_sent_in_dims, cap_to_dim_map["full_view"], dropout,
                                               use_layer_norm=use_layer_norm, groups=(4, 4))

        if "snlp" in capabilities:
            import stanza
            self.snlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,ner',
                                        use_gpu=False, pos_batch_size=2048)
            self.snlp_nn = ExpandContract(embedding_dim * 5, cap_to_dim_map["snlp"], dropout,
                                          use_layer_norm=use_layer_norm)

        if "key_phrases" in capabilities:
            import yake
            self.kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, dedupFunc='seqm',
                                                      windowsSize=3, top=10, features=None)
            self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
            nn.init.normal_(self.key_occ_cnt_pytextrank.weight, std=1 / embedding_dim)
            self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_pytextrank.weight, std=1 / embedding_dim)
            yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
            self.yake_dims = yake_dims
            self.yake_nn = ExpandContract(300, yake_dims, dropout,
                                          use_layer_norm=use_layer_norm, groups=(2, 2))

            try:
                from multi_rake import Rake
                rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
                self.rake_dims = rake_dims
                self.rake_nn = ExpandContract(300, rake_dims, dropout,
                                              use_layer_norm=use_layer_norm, groups=(2, 2))
                self.rake = Rake(language_code="en")
                keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
            except Exception:
                self.rake = None
                keyphrases_dim = 2 * embedding_dim + yake_dims
            self.keyphrase_nn = ExpandContract(keyphrases_dim, cap_to_dim_map["key_phrases"], dropout,
                                               use_layer_norm=use_layer_norm, groups=(4, 4))

        fasttext_file = kwargs["fasttext_file"] if "fasttext_file" in kwargs else "wiki-news-300d-1M-subword.bin"
        if not set(capabilities).isdisjoint({"key_phrases", "full_view", "nltk"}):
            self.text_model = fasttext.load_model(fasttext_file)

        self.pdict = get_all_tags()
        self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
        nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)
        self.sw_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)
        self.sent_start_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)
        self.is_oov_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)
        self.has_digit_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)
        self.is_mask_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)
        self.w_len = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)
        self.wc_emb = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

        if "nltk" in capabilities:
            import rake_nltk
            from textblob import TextBlob
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
            self.stop_words = set(stopwords.words('english'))
            self.rake_nltk = rake_nltk.Rake()
            self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_rake_nltk.weight, std=1 / embedding_dim)
            self.nltk_sid = SentimentIntensityAnalyzer()
            self.vader_sid = VaderSentimentIntensityAnalyzer()
            in_dims = 310 + 5 * embedding_dim
            self.nltk_nn = ExpandContract(in_dims, cap_to_dim_map["nltk"], dropout,
                                          use_layer_norm=use_layer_norm, groups=(2, 4))

        if "ibm_max" in capabilities:
            from ..external import ModelWrapper
            self.ibm_max = ModelWrapper()
            for p in self.ibm_max.model.parameters():
                p.requires_grad = False
            self.ibm_nn = ExpandContract(6, cap_to_dim_map["ibm_max"], dropout,
                                         use_layer_norm=use_layer_norm, groups=(1, 1))

        if "tmoji" in capabilities:
            from torchmoji.sentence_tokenizer import SentenceTokenizer
            from torchmoji.model_def import torchmoji_emojis
            from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
            with open(VOCAB_PATH, 'r') as f:
                maxlen = self.n_tokens_in
                self.vocabulary = json.load(f)
                self.st = SentenceTokenizer(self.vocabulary, maxlen)
            self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
            for p in self.tmoji.parameters():
                p.requires_grad = False
            self.tm_nn = ExpandContract(64, cap_to_dim_map["tmoji"], dropout,
                                        use_layer_norm=use_layer_norm, groups=(1, 1))

        self.contract_nn = ExpandContract(self.all_dims, embedding_dims, dropout,
                                          use_layer_norm=True, unit_norm=False, groups=(4, 4))

        if not use_as_super:
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims, internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                classifier_dims, internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims, internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs["attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                        classifier_dims, internal_dims,
                                                        n_encoders, n_decoders,
                                                        gaussian_noise, dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims, n_tokens_out, num_classes, dropout, **kwargs)

        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = get_regularization_layers(self)

    def get_one_crawl_sentence_vector(self, tm, sentence):
        tokens = fasttext.tokenize(sentence)
        if isinstance(tm, fasttext.FastText._FastText):
            result = torch.tensor([tm[t] for t in tokens])
        elif isinstance(tm, torchnlp.word_to_vector.char_n_gram.CharNGram):
            result = torch.stack([tm[t] for t in tokens])
        else:
            result = tm[tokens]
        return result

    def get__crawl_word_vectors(self, texts: List[str]):
        bpe = self.bpe
        cngram = self.cngram
        tm = self.crawl
        n_tokens_in = self.n_tokens_in
        result = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(tm, text) for text in texts], n_tokens_in)
        res2 = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(bpe, text) for text in texts], n_tokens_in)
        res3 = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(cngram, text) for text in texts], n_tokens_in)
        result = torch.cat([result, res2, res3], 2)
        result = result.to(get_device())
        result = self.crawl_nn(result)
        return result

    def get_torchmoji_probas(self, texts: List[str]):
        tokenized, _, _ = self.st.tokenize_sentences(texts)
        with torch.no_grad():
            prob = self.tmoji(tokenized)
        return torch.tensor(prob).to(get_device())

    def get_one_sentence_vector(self, m, text):
        vs = min(m.vector_size, 150)
        zeros = np.zeros(vs)
        result = [m[t][:150] if t in m else zeros for t in fasttext.tokenize(text)]
        return torch.tensor(result, dtype=float)

    def get_gensim_word_vectors(self, texts: List[str]):
        n_tokens_in = self.n_tokens_in
        result = []
        for m in self.gensim:
            r = stack_and_pad_tensors([self.get_one_sentence_vector(m, text) for text in texts], n_tokens_in)
            result.append(r)
        result = [r.float() for r in result]
        result = torch.cat(result, 2)
        result = result.to(get_device())
        result = self.gensim_nn(result)
        return result

    def get_nltk_vectors(self, texts: List[str]):
        # https://gist.github.com/japerk/1909413
        from textblob import TextBlob
        sid = self.nltk_sid
        vsid = self.vader_sid
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        rake = self.rake_nltk
        nltk_texts = [fasttext.tokenize(text) for text in texts]
        textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity]
                               for sentiment in [TextBlob(text).sentiment for text in texts]]
        textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
        textblob_sentiments = textblob_sentiments.to(get_device())

        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        m = self.text_model
        nltk_emb = stack_and_pad_tensors(
            [torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in)  # if t in m else np.zeros(m.vector_size)
        nltk_emb = nltk_emb.to(get_device())
        sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
        sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
        sid_vec = sid_vec.to(get_device())
        vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
        vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
        vsid_vec = vsid_vec.to(get_device())

        conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]
        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)),
            n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)

        phrases = [get_rake_nltk_phrases(rake, t) for t in texts]
        key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
        key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
        key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
        nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

        result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb,
                            nltk_rake_vectors, sid_vec, mask, has_digit], 2)
        result = result.to(get_device())
        result = self.nltk_nn(result)
        return result

    def get_sentence_vector(self, texts: List[str]):
        tm = self.text_model
        n_tokens_in = self.n_tokens_in
        result = torch.tensor([tm.get_sentence_vector(text) for text in texts])
        result = result.to(get_device())
        result = self.full_sent_nn(result)
        result = result.unsqueeze(1).expand(len(texts), n_tokens_in, result.size(1))
        return result

    def get_stanford_nlp_vectors(self, texts: List[str]):
        snlp = self.snlp
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        docs = [list(map(lambda x: dict(**x.to_dict()[0], ner=x.ner), snlp(doc).iter_tokens())) for doc in texts]

        upos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["upos"].lower()] for token in x]), docs)), n_tokens_in)
        upos_emb = self.tag_em(upos)
        xpos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["xpos"].lower()] for token in x]), docs)), n_tokens_in)
        xpos_emb = self.tag_em(xpos)
        deprel = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["deprel"].split(":")[0].lower()] for token in x]), docs)),
            n_tokens_in)
        deprel_emb = self.tag_em(deprel)
        deprel2 = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["deprel"].split(":")[1].lower()]
                                             if ":" in token["deprel"] else 0 for token in x]), docs)),
            n_tokens_in)
        deprel_emb2 = self.tag_em(deprel2)
        sner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["ner"].split("-")[1].lower()]
                                             if "-" in token["ner"] else 0 for token in x]), docs)),
            n_tokens_in)
        sner_emb = self.tag_em(sner)

        result = torch.cat([upos_emb, xpos_emb, deprel_emb, sner_emb, deprel_emb2], 2)
        result = result.to(get_device())
        result = self.snlp_nn(result)
        return result

    def get_spacy_nlp_vectors(self, texts: List[str]):
        pdict = self.pdict
        nlp = self.nlp
        n_tokens_in = self.n_tokens_in
        with torch.no_grad():
            spacy_texts = list(nlp.pipe(texts, n_process=1))
            text_tensors = list(map(lambda x: torch.tensor(x.tensor), spacy_texts))
            text_tensors = stack_and_pad_tensors(text_tensors, n_tokens_in)
            head_tensors = stack_and_pad_tensors(
                list(map(lambda x: torch.tensor([t.head.tensor for t in x]), spacy_texts)), n_tokens_in)
        text_tensors = text_tensors.to(get_device())
        head_tensors = head_tensors.to(get_device())

        wl = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([len(token) - 1 for token in x]).clamp(0, 15), spacy_texts)), n_tokens_in)
        wl = wl.to(get_device())
        wl_emb = self.w_len(wl)
        wc = (torch.tensor(list(map(len, spacy_texts))) // 10).long().unsqueeze(1).expand(len(texts), n_tokens_in)
        wc = wc.to(get_device())
        wc_emb = self.wc_emb(wc)
        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), spacy_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), spacy_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.pos_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        tag = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.tag_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        tag = tag.to(get_device())
        tag_emb = self.tag_em(tag)
        dep = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.dep_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        dep = dep.to(get_device())
        dep_emb = self.tag_em(dep)
        sw = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.is_stop) for token in x]), spacy_texts)), n_tokens_in)
        sw = sw.to(get_device())
        sw_emb = self.sw_em(sw)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.ent_type_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)
        is_oov = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.is_oov) for token in x]), spacy_texts)), n_tokens_in)
        is_oov = is_oov.to(get_device())
        is_oov_em = self.is_oov_em(is_oov)
        sent_start = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.sent_start) for token in x]), spacy_texts)), n_tokens_in)
        sent_start = sent_start.to(get_device())
        sent_start_em = self.sent_start_em(sent_start)
        head_dist = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([float(token.idx - token.head.idx) for token in x]), spacy_texts)),
            n_tokens_in)
        head_dist = head_dist.to(get_device())
        head_dist = head_dist.unsqueeze(2).expand(len(texts), n_tokens_in, 2)

        result = torch.cat([text_tensors, pos_emb, tag_emb, dep_emb, sw_emb, ner_emb, wl_emb, wc_emb,
                            mask, has_digit, is_oov_em, sent_start_em, head_dist, head_tensors], 2)
        result = result.to(get_device())
        result = self.spacy_nn(result)
        return result, spacy_texts

    def get_ibm_max(self, texts: List[str]):
        with torch.no_grad():
            result = self.ibm_max.predict(texts)
        result = result.to(get_device())
        result = self.ibm_nn(result)
        result = result.unsqueeze(1).expand(len(texts), self.n_tokens_in, result.size(1))
        return result

    def get_tmoji(self, texts: List[str]):
        with torch.no_grad():
            tm_probas = self.get_torchmoji_probas(texts)
        tm_probas = self.tm_nn(tm_probas)
        tm_probas = tm_probas.unsqueeze(1).expand(len(texts), self.n_tokens_in, tm_probas.size(1))
        return tm_probas

    def get_keyphrases(self, texts: List[str], spacy_texts):
        tm = self.text_model
        results = [get_pytextrank_wc_keylen(i) for i in spacy_texts]
        key_wc_pytextrank, key_occ_cnt_pytextrank = zip(*results)
        key_wc_pytextrank = stack_and_pad_tensors(key_wc_pytextrank, self.n_tokens_in)
        key_occ_cnt_pytextrank = stack_and_pad_tensors(key_occ_cnt_pytextrank, self.n_tokens_in)
        key_occ_cnt_pytextrank = key_occ_cnt_pytextrank.to(get_device())
        key_wc_pytextrank = key_wc_pytextrank.to(get_device())
        pytextrank_vectors = torch.cat((self.key_wc_pytextrank(key_wc_pytextrank),
                                        self.key_occ_cnt_pytextrank(key_occ_cnt_pytextrank)), 2)  # 16
        pytextrank_vectors = pytextrank_vectors.to(get_device())

        yake_ke = self.kw_extractor
        yake_embs = [[tm.get_sentence_vector(s) for s in map(itemgetter(0), yake_ke.extract_keywords(t))]
                     if has_words(t) else [np.zeros(300)] for t in texts]
        yake_embs = torch.tensor([
            np.average(yk, axis=0, weights=softmax(list(range(len(yk), 0, -1)))).astype(np.float32)
            if len(yk) > 0 else np.zeros(tm.get_dimension(), dtype=np.float32) for yk in yake_embs])
        yake_embs = yake_embs.to(get_device())
        yake_embs = self.yake_nn(yake_embs).unsqueeze(1).expand(len(texts), self.n_tokens_in, self.yake_dims)

        if self.rake is not None:
            rake_ke = self.rake
            rake_embs = [[tm.get_sentence_vector(s) for s in map(itemgetter(0), rake_ke.apply(t))]
                         if has_words(t) else [np.zeros(300)] for t in texts]
            rake_embs = torch.tensor([
                np.average(rk, axis=0, weights=softmax(list(range(len(rk), 0, -1)))).astype(np.float32)
                if len(rk) > 0 else np.zeros(tm.get_dimension(), dtype=np.float32) for rk in rake_embs])
            rake_embs = rake_embs.to(get_device())
            rake_embs = self.rake_nn(rake_embs).unsqueeze(1).expand(len(texts), self.n_tokens_in, self.rake_dims)
            result = torch.cat([pytextrank_vectors, yake_embs, rake_embs], 2)
        else:
            result = torch.cat([pytextrank_vectors, yake_embs], 2)
        result = result.to(get_device())
        result = self.keyphrase_nn(result)
        return result

    def get_word_vectors(self, texts: List[str]):
        cap_method = {"snlp": self.get_stanford_nlp_vectors, "full_view": self.get_sentence_vector,
                      "nltk": self.get_nltk_vectors, "ibm_max": self.get_ibm_max,
                      "tmoji": self.get_tmoji, "gensim": self.get_gensim_word_vectors,
                      "fasttext_crawl": self.get__crawl_word_vectors}
        results = []
        if "spacy" in self.capabilities:
            r, spt = self.get_spacy_nlp_vectors(texts)
            results.append(r)
        if "key_phrases" in self.capabilities and "spacy" in self.capabilities:
            r = self.get_keyphrases(texts, spt)
            results.append(r)
        for c in self.capabilities:
            if c == "spacy" or c == "key_phrases":
                continue
            r = cap_method[c](texts)
            results.append(r)
        clean_memory()
        result = torch.cat(results, 2)
        result = result.to(get_device())
        result = self.contract_nn(result)
        return result
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)

    '''for emo in emoji_ids:
        print(emoji.emojize("emoji is : " + str(emo) + " , "))
    print("-----------")'''

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)
    print("+++++++++++++++++++++++")
    # print(list(emojis))
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import example_helper import json from torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
def text_to_emoji(input_text, max_length):
    # argparser = argparse.ArgumentParser()
    # argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    # argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    # args = argparser.parse_args()

    # Load dictionary for tokenizing
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    # print(f'vocabulary: {vocabulary}')

    with open(os.path.join(os.path.dirname(__file__), './negative_words_parsed.txt'),
              'r', encoding='utf-8', errors='ignore') as negative_words_list:
        negative_words = list(negative_words_list)
    negative_words = [negative_word.rstrip('\n').lower()
                      for negative_word in negative_words if negative_word != '\n']

    with open(os.path.join(os.path.dirname(__file__), './positive_words_parsed.txt'), 'r') as positive_words_list:
        positive_words = list(positive_words_list)
    positive_words = [positive_word.rstrip('\n').lower()
                      for positive_word in positive_words if positive_word != '\n']

    st = SentenceTokenizer(vocabulary, max_length)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    # Determines the important words in the sentence
    tokenized, _, _ = st.tokenize_sentences([input_text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emotion ids
    emotion_ids = top_elements(prob, 5)
    # print(f'top five emotion ids: {emotion_ids}')

    # map to emotions
    emotions = list(map(lambda x: EMOTIONS[x], emotion_ids))
    # print(f'emotions: {emotions}')
    user_feelings = positive_or_negative(emotions)
    # print(f'user_feelings: {user_feelings}')

    # Find the words that are contributing to the feeling
    user_positive_words = []
    user_negative_words = []
    for word in input_text.split(' '):
        if word in positive_words:
            user_positive_words.append(word)
        elif word in negative_words:
            user_negative_words.append(word)

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emotion_ids)
    # print(f'emojis: {list(emojis)}')
    main_vibe = list(emojis)[0]
    # print(f'main_vibe: {main_vibe}')

    json_to_bot = {"user_emotion": user_feelings,
                   "positive": user_positive_words,
                   "negative": user_negative_words,
                   "main_vibe": main_vibe}
    return json.dumps(json_to_bot)
class Dataset(data.Dataset):
    """Custom data.Dataset compatible with data.DataLoader."""

    def __init__(self, data, vocab, hier=False, elmo=False, elmo_pre=None, deepmoji=False):
        self.id, self.X, self.y = data
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
        if self.y is None:
            self.y = None
        else:
            self.y = np.array(list(map(lambda label: self.emotion2label[label], self.y)))
        self.vocab = vocab
        self.num_total_seqs = len(self.X)
        self.tt = MyTokenizer()
        with open(VOCAB_PATH, 'r') as f:
            deepmoji_vocab = json.load(f)
        self.deepmoji_tt = SentenceTokenizer(deepmoji_vocab, 100)
        self.hier = hier
        self.elmo = elmo
        self.elmo_pre = elmo_pre  # pre-extracted elmo embeddings
        self.deepmoji = deepmoji

    def __getitem__(self, index):
        """Returns one data pair (source and target)."""
        ind = self.id[index]
        X_text = self.X[index]
        if self.y is None:
            y = None
        else:
            y = self.y[index]
        if self.hier:
            if self.elmo_pre is not None:
                f = lambda l, d: itemgetter(*l)(d)  # get Tuple(values) with List[keys]
                X_1, X_2, X_3 = self.X[index][0], self.X[index][1], self.X[index][2]
                return (*f([X_1.lower(), X_2.lower(), X_3.lower()], self.elmo_pre), y, ind, X_text)
            X_1, X_2, X_3 = self.preprocess(self.X[index])
            return X_1, X_2, X_3, y, ind, X_text
        else:
            X = self.preprocess(self.X[index])
            return X, y, ind, X_text

    def __len__(self):
        return self.num_total_seqs

    def vectorize(self, sentence):
        sequence = []
        for word in self.tt.tokenize(clean_sentence(sentence)):
            if word in text_to_emoji:
                word = text_to_emoji[word]
            # word = word.translate(None, string.punctuation)
            if constant.extra_prep:
                table = str.maketrans({key: None for key in string.punctuation})
                word = word.translate(table)
                if len(word) == 0:
                    continue
            # the following code maybe not useful at all
            old_word = word
            if word not in constant.gen_vocabs:
                word = word.lower()
            if word not in constant.gen_vocabs:
                word = word[0].upper() + word[1:]
            if word not in constant.gen_vocabs:
                word = word.upper()
            if old_word not in constant.gen_vocabs:
                if word in constant.gen_vocabs:
                    print(">", old_word, word)
            if word not in constant.gen_vocabs:
                word = old_word
            if word in self.vocab.word2index:
                sequence.append(self.vocab.word2index[word])
            else:
                sequence.append(constant.UNK_idx)
        return sequence

    def preprocess(self, arr):
        """Converts words to ids."""
        t1 = 'CLS ' + arr[0].lower()
        t2 = 'CLS ' + arr[1].lower()
        t3 = 'CLS ' + arr[2].lower()
        # print("preprocess deepmoji=", self.deepmoji)
        if self.elmo:
            t1 = self.tt.tokenize(clean_sentence(t1))
            t2 = self.tt.tokenize(clean_sentence(t2))
            t3 = self.tt.tokenize(clean_sentence(t3))
            if self.hier:
                return t1, t2, t3
            else:
                return np.concatenate((t1, t2, t3))
        elif self.deepmoji:
            t1, _, _ = self.deepmoji_tt.tokenize_sentences([t1])  # vectorize
            t2, _, _ = self.deepmoji_tt.tokenize_sentences([t2])
            t3, _, _ = self.deepmoji_tt.tokenize_sentences([t3])
            t1 = np.trim_zeros(t1.astype(np.int32)[0])
            t2 = np.trim_zeros(t2.astype(np.int32)[0])
            t3 = np.trim_zeros(t3.astype(np.int32)[0])
            if self.hier:
                return torch.LongTensor(t1), torch.LongTensor(t2), torch.LongTensor(t3)
            else:
                return torch.LongTensor(t1 + t2 + t3)
        else:
            t1 = self.vectorize(t1)
            t2 = self.vectorize(t2)
            t3 = self.vectorize(t3)
            if self.hier:
                return torch.LongTensor(t1), torch.LongTensor(t2), torch.LongTensor(t3)
            else:
                return torch.LongTensor(t1 + t2 + t3)
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import example_helper import json from torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
print(f'test_sentences length: {len(test_sentences[0])}')

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
# st = SentenceTokenizer(vocabulary, args.maxlen)
st = SentenceTokenizer(vocabulary, 500)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

# Running predictions
# Determines the important words in the sentence
# tokenized, _, _ = st.tokenize_sentences([args.text])
tokenized, _, _ = st.tokenize_sentences(test_sentences[0])
# print(f'tokenized words: {tokenized}')

# Get sentence probability
# prob = model(tokenized)[0]
print(f'tokenized: {tokenized}')
prob = model(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the torchMoji repo.
# print(f'prob:{prob}')
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
print(f'prob: {prob}')
for i, t in enumerate(test_sentences[0]):
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.