def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
def test_encode_texts():
    """ Text encoding is stable. """

    TEST_SENTENCES = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)

    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
def __init__(self): """ Ctor. """ # Automatically download weights if not os.path.isfile(PRETRAINED_PATH): os.system( "(cd torchMoji && python scripts/download_weights_yes.py)") # Instanciate a pytorch model self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH) # Load vocabulary with open(VOCAB_PATH, 'r') as f: vocabulary = json.load(f) # Create tokenizer to split a sentence into words self._st = SentenceTokenizer(vocabulary, self._max_message_len_words) # Load a mapping in neural network prediction to smileys emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json") with open(emoji_codes_path, 'r') as f: self._emoji_codes = json.load(f) # This is a reduction of 64 smileys into there "happiness" bool flag with open("sentiment.json", 'r') as f: self._sentiments = json.load(f) pass
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   [data['train_ind'],
                                                    data['val_ind'],
                                                    data['test_ind']],
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
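# A minimal usage sketch for load_benchmark above (not from the original
# source). The benchmark pickle path is a hypothetical placeholder; the
# vocabulary path follows the convention used by the other snippets here.
import json

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# Extend the vocabulary with up to 10000 tokens from the benchmark's training split
data = load_benchmark('../data/benchmark_dataset.pickle', vocab, extend_with=10000)

train_texts, val_texts, test_texts = data['texts']
train_labels, val_labels, test_labels = data['labels']
print('added: {}, maxlen: {}, batch_size: {}'.format(
    data['added'], data['maxlen'], data['batch_size']))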
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])

    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])

    assert st.to_sentence(token[0]) == expected
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # pickle requires binary mode when writing under Python 3
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print(' done. Coverage: {}'.format(cover))
def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
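# Standalone sketch (not part of the original tests) of what the top_elements
# helper above computes: the indices of the k largest entries, highest first.
import numpy as np

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

probs = np.array([0.1, 0.7, 0.05, 0.15])
print(top_elements(probs, 2))  # -> [1 3]: index 1 (0.7) first, then index 3 (0.15)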
class Botmoji():
    def __init__(self, max_sentence_length=30):
        # Tokenizing using the dictionary
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
        self.st = SentenceTokenizer(self.vocabulary, max_sentence_length)

        # Loading the model
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def emojize_text(self, text, maxemojis, minconfidence):
        prob = self.encode(text)

        # Top emoji ID
        emoji_ids = top_emojis(prob, maxemojis, minconfidence)
        if len(emoji_ids) == 0:
            return ''

        # Map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True)

    def encode(self, text):
        # Running predictions
        tokenized, _, _ = self.st.tokenize_sentences([text])
        # Getting emojis probabilities
        prob = self.model(tokenized)[0]
        return prob

    def encode_multiple(self, texts):
        filtered_texts = ['_' if text == '' else text for text in texts]
        # Running predictions
        tokenized, _, _ = self.st.tokenize_sentences(filtered_texts)
        # Getting emojis probabilities
        prob = self.model(tokenized)
        return prob
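# Hypothetical usage of the Botmoji wrapper above; it assumes VOCAB_PATH,
# PRETRAINED_PATH, EMOJIS and the top_emojis helper are available exactly as
# in the class definition.
botmoji = Botmoji(max_sentence_length=30)

# Annotate a single message with at most 3 emojis above 5% confidence
print(botmoji.emojize_text('I love how you never reply back..', 3, 0.05))

# Batch-encode several messages into per-sentence emoji probability vectors
probs = botmoji.encode_multiple(['This is the shit', ''])
print(len(probs))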
class Emojize:
    def __init__(self):
        with open(vocab_file_path, 'r') as f:
            vocabulary = json.load(f)

        max_sentence_length = 100
        self.st = SentenceTokenizer(vocabulary, max_sentence_length)
        self.model = torchmoji_emojis(model_weights_path)

    def predict(self, text):
        if not isinstance(text, list):
            text = [text]

        tokenized, _, _ = self.st.tokenize_sentences(text)
        prob = self.model(tokenized)[0]
        emoji_ids = top_elements(prob, 1)
        print("Emo Id: ", emoji_ids)

        # Emoji map in emoji_overview.png
        emojis = EMOJIS[emoji_ids[0]]
        return emojis
argparser = argparse.ArgumentParser()
argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
args = argparser.parse_args()

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, args.maxlen)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

# Running predictions
tokenized, _, _ = st.tokenize_sentences([args.text])
# Get sentence probability
prob = model(tokenized)[0]

# Top emoji id
emoji_ids = top_elements(prob, 5)

# map to emojis
emojis = map(lambda x: EMOJIS[x], emoji_ids)

# Print the input text followed by its top predicted emojis
print(emoji.emojize("{} {}".format(args.text, " ".join(emojis)), use_aliases=True))
coverage_result = [p]
print('Calculating coverage for {}'.format(p))
with open(p, 'rb') as f:
    if IS_PYTHON2:
        s = pickle.load(f)
    else:
        s = pickle.load(f, fix_imports=True)

# Decode data
try:
    s['texts'] = [unicode(x) for x in s['texts']]
except UnicodeDecodeError:
    s['texts'] = [x.decode('utf-8') for x in s['texts']]

# Own
st = SentenceTokenizer({}, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=10000)
coverage_result.append(coverage(tests[2]))

# Last
st = SentenceTokenizer(vocab, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=0)
coverage_result.append(coverage(tests[2]))

# Full
                       type=int, default=30, help="Max length of input text")
args = argparser.parse_args()

sentence_probs = []
retokenized_sentences = []
output_path = os.path.join(os.path.dirname(args.filepath), 'sentence_emojis.pkl')
retokenized_sentences_output_path = os.path.join(
    os.path.dirname(args.filepath), 'retokenized_sentences.pkl')

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, args.maxlen)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

sentences = load_pickle(args.filepath)

# TODO: encode multiple sentences at once.
# Needs TorchMoji module to handle empty sentences and output equal probabilities
# flattened_sentences = [utterance for conversation in sentences for utterance in conversation]
# print('Encoding sentences ...')
# flattened_tokenized, _, _ = st.tokenize_sentences(flattened_sentences)
# flattened_probs = model(flattened_tokenized)
# print('TorchMoji encoding done.')

idx = 0
for conversation in sentences:
    idx += 1
    'This is the shit'
]

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

print('Running predictions.')
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
prob = model(tokenized)

for prob in [prob]:
    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the torchMoji repo.
    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
    },
    {
        'label': 'sentence 5'
    },
    {
        'label': 'sentence 6'
    },
    {
        'label': 'sentence 7'
    },
    {
        'label': 'sentence 8'
    },
    {
        'label': 'sentence 9'
    },
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import json from torchMoji.torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
class Sentiment:
    """
    Wrapper class for torchMoji.
    Repo: https://github.com/huggingface/torchMoji
    Original DeepMoji paper:
    "Using millions of emoji occurrences to learn any-domain representations
    for detecting sentiment, emotion and sarcasm"
    https://arxiv.org/pdf/1708.00524.pdf
    """

    # Maximum length of a sentence in words
    _max_message_len_words = 30

    # Top K emoji predictions to derive sentiment from
    _top_k_predictions = 5

    def __init__(self):
        """ Ctor. """
        # Automatically download weights
        if not os.path.isfile(PRETRAINED_PATH):
            os.system(
                "(cd torchMoji && python scripts/download_weights_yes.py)")

        # Instantiate a pytorch model
        self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH)

        # Load vocabulary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        # Create tokenizer to split a sentence into words
        self._st = SentenceTokenizer(vocabulary, self._max_message_len_words)

        # Load a mapping from neural network predictions to emojis
        emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json")
        with open(emoji_codes_path, 'r') as f:
            self._emoji_codes = json.load(f)

        # This is a reduction of the 64 emojis into their "happiness" bool flag
        with open("sentiment.json", 'r') as f:
            self._sentiments = json.load(f)

    def __call__(self, message: str) -> bool:
        """
        Perform inference.
        :param message: text sentence
        :return: True if the sentence has positive sentiment, False if negative
        """
        assert isinstance(message, str)

        # Tokenize sentence
        tokens, _, _ = self._st.tokenize_sentences([message])

        # Neural network inference
        prob, *_ = self._model(tokens)

        # Analyse only the top K predictions out of 64
        top = top_elements(prob, self._top_k_predictions)

        if False:
            # See original top-K emojis before reduction
            top_emoji = [self._emoji_codes[str(i)] for i in top]
            print(" ".join(
                [emoji.emojize(e, use_aliases=True) for e in top_emoji]))

        # Reduce to "happiness" flag
        votes = [self._sentiments[str(e)] for e in top]
        result = majority_vote(votes)

        return result
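# Hypothetical usage of the Sentiment wrapper above; it assumes the torchMoji
# weights, vocabulary.json, emoji_codes.json and sentiment.json files are in
# place, as expected by the constructor.
if __name__ == "__main__":
    sentiment = Sentiment()
    print(sentiment("I love mom's cooking"))               # likely True (positive)
    print(sentiment("I love how you never reply back.."))  # likely False (negative)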