import json

import emoji
import torch
import torch.nn as nn

from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
from torchmoji.sentence_tokenizer import SentenceTokenizer

# EMOJIS is assumed to be the usual 64-entry list of emoji aliases (e.g. ':joy:')
# matching emoji_overview.png in the torchMoji repo.


class MojiModel(nn.Module):
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()

    def predict(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        prob = self.emoji_model(tokenized)[0]
        return prob

    def moji_feat(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        return self.feat_model(tokenized)[0]

    def to_emoji(self, idx):
        return emoji.emojize(self.EMOJIS[idx], use_aliases=True)
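A minimal usage sketch for the wrapper above (an addition, not part of the original file): it assumes the torchMoji weights and vocabulary referenced by PRETRAINED_PATH / VOCAB_PATH are already downloaded and that EMOJIS is the 64-entry alias list; passing use_cuda=False keeps everything on CPU.

# Hypothetical usage of MojiModel (CPU only); the sentence is illustrative.
model = MojiModel(use_cuda=False)
probs = model.predict("I love mom's cooking")       # 64 emoji probabilities
top_idx = int(probs.argmax())                       # index of the most likely emoji
print(model.to_emoji(top_idx), float(probs[top_idx]))
features = model.moji_feat("I love mom's cooking")  # torchMoji feature encoding (2304-dim in the released model)
print(features.shape)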
def test_encode_texts():
    """ Text encoding is stable. """
    TEST_SENTENCES = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)

    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
async def predict_sentence_emojis(sentence: str, num_to_predict: int = 5) -> dict:
    """Predict the top n emojis for a sentence.

    :param sentence: sentence used in prediction
    :param num_to_predict: number of top emojis to return
    :return: dictionary where each key is a predicted emoji and the value is its probability
    """
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, MAXLEN)
    model = torchmoji_emojis(PRETRAINED_PATH)

    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model(tokenized)[0]
    ind_top = top_elements(prob, num_to_predict)
    emojis = list(map(lambda x: EMOJIS[x], ind_top))

    # Might be useful if we need to send it this way
    # emojis_unicode_escape = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj].encode('unicode-escape')
    #                          for emoj in emojis]
    emojis_unicode = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj] for emoj in emojis]
    return dict(zip(emojis_unicode, prob[ind_top]))
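predict_sentence_emojis relies on a top_elements helper that is not shown with it; the same k-best-index helper appears verbatim in several other snippets in this collection, so a copy is repeated here for completeness (MAXLEN, EMOJIS, and unicode_codes are likewise module-level names assumed to be defined alongside it).

import numpy as np

def top_elements(array, k):
    # indices of the k largest values, ordered from highest to lowest probability
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]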
class EmotionBiLSTM(Model):
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.accuracy = MicroMetrics(vocab)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
        final_concatenated_dimension = 64 * 3
        self.input_layer = torch.nn.Linear(in_features=final_concatenated_dimension, out_features=64)
        self.output_layer = torch.nn.Linear(in_features=64, out_features=vocab.get_vocab_size("labels"))
        self.sigmoid = nn.Sigmoid()
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
        self.st = SentenceTokenizer(self.vocabulary, 20)
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def tokenize(self, sentences):
        tokenized, _, _ = self.st.tokenize_sentences(sentences)
        return torch.from_numpy(tokenized.astype(np.int64))

    def forward(self,
                turn1,
                turn2,
                turn3,
                conversation_id: str,
                turns: str,
                labels: torch.Tensor = None):
        # TODO: look up reverse embedding of padded sequences
        turn1 = [x['turn1'] for x in turn1]
        turn2 = [x['turn2'] for x in turn2]
        turn3 = [x['turn3'] for x in turn3]
        predictions1 = self.model(self.tokenize(turn1))
        predictions2 = self.model(self.tokenize(turn2))
        predictions3 = self.model(self.tokenize(turn3))
        predictions = torch.cat([predictions1, predictions2, predictions3], dim=1)
        input2hidden = self.input_layer(predictions)
        label_logits = self.sigmoid(self.output_layer(input2hidden))
        # self.matrix_attention = self.matrix_attention(encoded_turn1and2, encoded_turn3)
        label_logits = F.softmax(label_logits, dim=1)
        output = {
            "prediction": [self.label_index_to_label[x]
                           for x in label_logits.argmax(dim=1).numpy()],
            "ids": [x["ids"] for x in conversation_id],
            "turns": [x["turns"] for x in turns]
        }
        if labels is not None:
            # TODO: check loss with and without mask
            self.accuracy(label_logits, labels)
            output["loss"] = cross_entropy_loss(label_logits, labels)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
def get_emotion_features_from_text(text, audio_filename):
    # https://github.com/huggingface/torchMoji/blob/master/examples/score_texts_emojis.py
    if text == '':
        emoji_ids = []
        one_hot_encodings = []
    else:
        text = [text]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        maxlen = 30

        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        st = SentenceTokenizer(vocabulary, maxlen)
        model = torchmoji_emojis(PRETRAINED_PATH)

        tokenized, _, _ = st.tokenize_sentences(text)
        prob = model(tokenized)

        # Find top emojis for each sentence. Emoji ids (0-63)
        # correspond to the mapping in emoji_overview.png
        # at the root of the torchMoji repo.
        scores = []
        for i, t in enumerate(text):
            t_tokens = tokenized[i]
            t_score = [t]
            t_prob = prob[i]
            ind_top = top_elements(t_prob, 5)
            t_score.append(sum(t_prob[ind_top]))
            t_score.extend(ind_top)
            t_score.extend([t_prob[ind] for ind in ind_top])
            scores.append(t_score)

        emoji_ids = scores[0][2:2 + 5]
        one_hot_encodings = []
        for emoji_idx in emoji_ids:
            one_hot_encodings.append([0 if i != emoji_idx else 1 for i in range(64)])

    a = audio_filename.split('/')
    filename = '/' + '/'.join(a[1:-1]) + '/onehot_emotion_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)
    filename = '/' + '/'.join(a[1:-1]) + '/emoji_ids_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)
    return emoji_ids, one_hot_encodings
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input. """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}
    sentence = 'aasdf basdf basdf basdf'

    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns. """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}
    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'

    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
class EmojiPredictor(object):
    def __init__(self):
        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.st = SentenceTokenizer(vocabulary, 30)
        # Loading model
        self.model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        self.dangoURL = "https://emoji.getdango.com/api/emoji?q="

    def getPredictedEmojis(self, text):
        api_response = ''
        try:
            # It turned out that Dango has stopped the API service,
            # so we may just use the DeepMoji model instead.
            r = requests.get("https://emoji.getdango.com/api/emoji", params={"q": text})
            api_response = json.loads(r.text)
        except Exception:
            pass
        if 'results' in api_response:
            res = [item['text'] for item in api_response['results']]
            if len(res) < 5:
                extraemojis = self.localPredict(text)
                for k in extraemojis:
                    if k not in res:
                        res.append(k)
                    if len(res) == 5:
                        return res
            else:
                return res[:5]
        else:
            return self.localPredict(text)

    def localPredict(self, text):
        tokenized, _, _ = self.st.tokenize_sentences([text.lower()])
        # Get sentence probability
        prob = self.model(tokenized)[0]
        # Top emoji ids
        emoji_ids = top_elements(prob, 6)
        # Drop emoji id 42 while keeping the ranking order
        emoji_ids = [i for i in emoji_ids if i != 42]
        if len(emoji_ids) > 5:
            emoji_ids = emoji_ids[:5]
        # map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True).split()
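A brief, hypothetical usage sketch for EmojiPredictor: since the Dango API has been discontinued, getPredictedEmojis effectively falls back to localPredict, which needs only the local torchMoji weights and vocabulary.

# Hypothetical usage; the example sentence is illustrative only.
predictor = EmojiPredictor()
print(predictor.getPredictedEmojis("this is the best day ever"))
# -> a list of up to five rendered emoji characters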
def test_score_emoji():
    """ Emoji predictions make sense. """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenizing the texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(args.text, ' '.join(emojis)), use_aliases=True))
def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]
    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
def text_to_emoji(text, maxlen):
    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(text, ' '.join(emojis)), use_aliases=True))
class Emoji(runner.Runner):
    name = "emoji"

    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)
        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")
        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)
        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True)
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)
        self.log.debug("Model loaded")

    async def func(self, request, **kwargs):
        text_batch = [self.normalize_input(request)]
        text_batch = [text.replace('"', "") for text in text_batch]  # remove quotes from text
        tokenized, _, _ = self.tm_sentence_tokenizer.tokenize_sentences(text_batch)
        prob = self.tm_model(tokenized)[0]
        emoji_ids = top_elements(prob, 3)
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        emoji_score = [emoji.emojize(e, use_aliases=True) for e in emojis]
        return emoji_score
from __future__ import print_function, division, unicode_literals

import json
import sys

import numpy as np

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

print("start the file")
message = sys.argv[1] + ""


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

tokenized, _, _ = st.tokenize_sentences([message])
prob = model(tokenized)
ind_top = top_elements(prob[0], 5)
print(ind_top)
# Empty results frame: Sentiment140-style columns plus one column per emoji.
df_full = pd.DataFrame(
    columns=['sentiment', 'id', 'date', 'query', 'screen_name', 'text'] + list(emoji_codes))
# Twitter sample
# df_full = pd.DataFrame(columns=['sentiment', 'text'] + list(emoji_codes))

# Run in chunks to avoid overusing computational resources.
chunk_size = 1000
for i in range(chunk_size, len(df) + chunk_size, chunk_size):
    if i > len(df):
        i = len(df)
        chunk_size = len(df) % chunk_size
    # grab the subset of documents
    documents = list(df.text[i - chunk_size:i])
    # tokenize them
    tokens, infos, stats = st.tokenize_sentences(documents)
    # fit the probabilities
    prob = model(tokens)
    # append the results to df_full
    df_full = df_full.append(
        df[i - chunk_size:i].join(
            pd.DataFrame(prob, columns=emoji_codes, index=range(i - chunk_size, i))))
    # print the status
    print("finished processing", i, "documents")

# confirm shape
df_full.shape

# save as csv
filename = '/Users/ikennedy/Work/UW/Class/SICSS/SICSS R/emoji_tweet_sample.csv'
df_full.to_csv(filename)
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

# Use torchMoji to encode texts into emotional feature vectors.
data = {'texts': [], 'batch_size': batch_size, 'labels': [], 'maxlen': maxlen, 'added': 0}
for i, df in enumerate(tables):
    print(tables_meaning[i])
    with open("data/{}_eng.json".format(tables_meaning[i]), "r") as f:
        translated = json.load(f)
    tokenized, _, _ = st.tokenize_sentences(translated)
    data['texts'].append(tokenized)

# add labels
feelings = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
            'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
for i, df in enumerate(tables):
    tmp_data = data['texts'][i]
    new_data = []
    labels = []
    for ri in range(len(df)):
        # labels.append([df.iloc[ri][f] == '1' for fi, f in enumerate(feelings)])
        not_find = True
        for fi, f in enumerate(feelings):
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

print('Running predictions.')
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
prob = model(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the torchMoji repo.
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
    t_tokens = tokenized[i]
    t_score = [t]
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 5)
    t_score.append(sum(t_prob[ind_top]))
    t_score.extend(ind_top)
class LangFeaturesModel(Fasttext1DCNNModel):
    def __init__(self, classifier_dims, num_classes, embedding_dims, gaussian_noise, dropout,
                 internal_dims, n_layers, featurizer, final_layer_builder,
                 n_tokens_in=64, n_tokens_out=16, capabilities2dims=dict(), use_as_super=False, **kwargs):
        super(LangFeaturesModel, self).__init__(classifier_dims, num_classes, embedding_dims,
                                                gaussian_noise, dropout, internal_dims, n_layers,
                                                featurizer, final_layer_builder,
                                                n_tokens_in, n_tokens_out, use_as_super=True, **kwargs)
        assert "capabilities" in kwargs
        capabilities = kwargs["capabilities"]
        kwargs["rake_dims"] = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
        kwargs["yake_dims"] = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
        assert "key_phrases" not in capabilities or ("key_phrases" in capabilities and "spacy" in capabilities)
        use_layer_norm = kwargs["use_layer_norm"] if "use_layer_norm" in kwargs else False
        self.capabilities = capabilities
        embedding_dim = 8
        cap_to_dim_map = {"spacy": 128, "snlp": 32, "key_phrases": 64, "nltk": 192,
                          "full_view": 64, "tmoji": 32, "ibm_max": 16,
                          "gensim": 256, "fasttext_crawl": 256}
        cap_to_dim_map.update(capabilities2dims)
        all_dims = sum([cap_to_dim_map[c] for c in capabilities])
        self.cap_to_dim_map = cap_to_dim_map
        self.all_dims = all_dims

        if "spacy" in capabilities:
            tr = pytextrank.TextRank(token_lookback=7)
            self.nlp = spacy.load("en_core_web_lg", disable=[])
            self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
            spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
            self.spacy_nn = ExpandContract(spacy_in_dims, cap_to_dim_map["spacy"], dropout,
                                           use_layer_norm=use_layer_norm, groups=(2, 4))

        if "fasttext_crawl" in capabilities:
            self.bpe = BPEmb(dim=200)
            self.cngram = CharNGram()
            fasttext_crawl_file = kwargs["fasttext_crawl_file"] if "fasttext_crawl_file" in kwargs else "crawl-300d-2M-subword.bin"
            self.crawl = fasttext.load_model(fasttext_crawl_file)
            self.crawl_nn = ExpandContract(200 + 300 + 100, cap_to_dim_map["fasttext_crawl"], dropout,
                                           use_layer_norm=use_layer_norm, groups=(4, 4))

        if "gensim" in capabilities:
            gensim = [api.load("glove-twitter-50"),
                      api.load("glove-wiki-gigaword-50"),
                      api.load("word2vec-google-news-300"),
                      api.load("conceptnet-numberbatch-17-06-300")]
            self.gensim = gensim
            self.gensim_nn = ExpandContract(400, cap_to_dim_map["gensim"], dropout,
                                            use_layer_norm=use_layer_norm, groups=(4, 4))

        if "full_view" in capabilities:
            full_sent_in_dims = 300
            self.full_sent_nn = ExpandContract(full_sent_in_dims, cap_to_dim_map["full_view"], dropout,
                                               use_layer_norm=use_layer_norm, groups=(4, 4))

        if "snlp" in capabilities:
            import stanza
            self.snlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,ner',
                                        use_gpu=False, pos_batch_size=2048)
            self.snlp_nn = ExpandContract(embedding_dim * 5, cap_to_dim_map["snlp"], dropout,
                                          use_layer_norm=use_layer_norm)

        if "key_phrases" in capabilities:
            import yake
            self.kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, dedupFunc='seqm',
                                                      windowsSize=3, top=10, features=None)
            self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
            nn.init.normal_(self.key_occ_cnt_pytextrank.weight, std=1 / embedding_dim)
            self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_pytextrank.weight, std=1 / embedding_dim)
            yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
            self.yake_dims = yake_dims
            self.yake_nn = ExpandContract(300, yake_dims, dropout,
                                          use_layer_norm=use_layer_norm, groups=(2, 2))

            try:
                from multi_rake import Rake
                rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
                self.rake_dims = rake_dims
                self.rake_nn = ExpandContract(300, rake_dims, dropout,
                                              use_layer_norm=use_layer_norm, groups=(2, 2))
                self.rake = Rake(language_code="en")
                keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
            except Exception:
                self.rake = None
                keyphrases_dim = 2 * embedding_dim + yake_dims
            self.keyphrase_nn = ExpandContract(keyphrases_dim, cap_to_dim_map["key_phrases"], dropout,
                                               use_layer_norm=use_layer_norm, groups=(4, 4))

        fasttext_file = kwargs["fasttext_file"] if "fasttext_file" in kwargs else "wiki-news-300d-1M-subword.bin"
        if not set(capabilities).isdisjoint({"key_phrases", "full_view", "nltk"}):
            self.text_model = fasttext.load_model(fasttext_file)

        self.pdict = get_all_tags()
        self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
        nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)
        self.sw_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)
        self.sent_start_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)
        self.is_oov_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)
        self.has_digit_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)
        self.is_mask_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)
        self.w_len = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)
        self.wc_emb = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

        if "nltk" in capabilities:
            import rake_nltk
            from textblob import TextBlob
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
            self.stop_words = set(stopwords.words('english'))
            self.rake_nltk = rake_nltk.Rake()
            self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_rake_nltk.weight, std=1 / embedding_dim)
            self.nltk_sid = SentimentIntensityAnalyzer()
            self.vader_sid = VaderSentimentIntensityAnalyzer()
            in_dims = 310 + 5 * embedding_dim
            self.nltk_nn = ExpandContract(in_dims, cap_to_dim_map["nltk"], dropout,
                                          use_layer_norm=use_layer_norm, groups=(2, 4))

        if "ibm_max" in capabilities:
            from ..external import ModelWrapper
            self.ibm_max = ModelWrapper()
            for p in self.ibm_max.model.parameters():
                p.requires_grad = False
            self.ibm_nn = ExpandContract(6, cap_to_dim_map["ibm_max"], dropout,
                                         use_layer_norm=use_layer_norm, groups=(1, 1))

        if "tmoji" in capabilities:
            from torchmoji.sentence_tokenizer import SentenceTokenizer
            from torchmoji.model_def import torchmoji_emojis
            from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
            with open(VOCAB_PATH, 'r') as f:
                maxlen = self.n_tokens_in
                self.vocabulary = json.load(f)
                self.st = SentenceTokenizer(self.vocabulary, maxlen)
            self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
            for p in self.tmoji.parameters():
                p.requires_grad = False
            self.tm_nn = ExpandContract(64, cap_to_dim_map["tmoji"], dropout,
                                        use_layer_norm=use_layer_norm, groups=(1, 1))

        self.contract_nn = ExpandContract(self.all_dims, embedding_dims, dropout,
                                          use_layer_norm=True, unit_norm=False, groups=(4, 4))

        if not use_as_super:
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims, internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                classifier_dims, internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims, internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs["attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                        classifier_dims, internal_dims,
                                                        n_encoders, n_decoders,
                                                        gaussian_noise, dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims, n_tokens_out, num_classes, dropout, **kwargs)

        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = get_regularization_layers(self)

    def get_one_crawl_sentence_vector(self, tm, sentence):
        tokens = fasttext.tokenize(sentence)
        if isinstance(tm, fasttext.FastText._FastText):
            result = torch.tensor([tm[t] for t in tokens])
        elif isinstance(tm, torchnlp.word_to_vector.char_n_gram.CharNGram):
            result = torch.stack([tm[t] for t in tokens])
        else:
            result = tm[tokens]
        return result

    def get__crawl_word_vectors(self, texts: List[str]):
        bpe = self.bpe
        cngram = self.cngram
        tm = self.crawl
        n_tokens_in = self.n_tokens_in
        result = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(tm, text) for text in texts], n_tokens_in)
        res2 = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(bpe, text) for text in texts], n_tokens_in)
        res3 = stack_and_pad_tensors([self.get_one_crawl_sentence_vector(cngram, text) for text in texts], n_tokens_in)
        result = torch.cat([result, res2, res3], 2)
        result = result.to(get_device())
        result = self.crawl_nn(result)
        return result

    def get_torchmoji_probas(self, texts: List[str]):
        tokenized, _, _ = self.st.tokenize_sentences(texts)
        with torch.no_grad():
            prob = self.tmoji(tokenized)
        return torch.tensor(prob).to(get_device())

    def get_one_sentence_vector(self, m, text):
        vs = min(m.vector_size, 150)
        zeros = np.zeros(vs)
        result = [m[t][:150] if t in m else zeros for t in fasttext.tokenize(text)]
        return torch.tensor(result, dtype=float)

    def get_gensim_word_vectors(self, texts: List[str]):
        n_tokens_in = self.n_tokens_in
        result = []
        for m in self.gensim:
            r = stack_and_pad_tensors([self.get_one_sentence_vector(m, text) for text in texts], n_tokens_in)
            result.append(r)
        result = [r.float() for r in result]
        result = torch.cat(result, 2)
        result = result.to(get_device())
        result = self.gensim_nn(result)
        return result

    def get_nltk_vectors(self, texts: List[str]):
        # https://gist.github.com/japerk/1909413
        from textblob import TextBlob
        sid = self.nltk_sid
        vsid = self.vader_sid
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        rake = self.rake_nltk
        nltk_texts = [fasttext.tokenize(text) for text in texts]
        textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity]
                               for sentiment in [TextBlob(text).sentiment for text in texts]]
        textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
        textblob_sentiments = textblob_sentiments.to(get_device())

        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        m = self.text_model
        nltk_emb = stack_and_pad_tensors(
            [torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in)  # if t in m else np.zeros(m.vector_size)
        nltk_emb = nltk_emb.to(get_device())
        sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
        sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
        sid_vec = sid_vec.to(get_device())
        vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
        vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
        vsid_vec = vsid_vec.to(get_device())

        conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]
        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)),
            n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)

        phrases = [get_rake_nltk_phrases(rake, t) for t in texts]
        key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
        key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
        key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
        nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

        result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb,
                            nltk_rake_vectors, sid_vec, mask, has_digit], 2)
        result = result.to(get_device())
        result = self.nltk_nn(result)
        return result

    def get_sentence_vector(self, texts: List[str]):
        tm = self.text_model
        n_tokens_in = self.n_tokens_in
        result = torch.tensor([tm.get_sentence_vector(text) for text in texts])
        result = result.to(get_device())
        result = self.full_sent_nn(result)
        result = result.unsqueeze(1).expand(len(texts), n_tokens_in, result.size(1))
        return result

    def get_stanford_nlp_vectors(self, texts: List[str]):
        snlp = self.snlp
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        docs = [list(map(lambda x: dict(**x.to_dict()[0], ner=x.ner), snlp(doc).iter_tokens())) for doc in texts]

        upos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["upos"].lower()] for token in x]), docs)), n_tokens_in)
        upos_emb = self.tag_em(upos)
        xpos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["xpos"].lower()] for token in x]), docs)), n_tokens_in)
        xpos_emb = self.tag_em(xpos)
        deprel = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["deprel"].split(":")[0].lower()] for token in x]), docs)),
            n_tokens_in)
        deprel_emb = self.tag_em(deprel)
        deprel2 = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["deprel"].split(":")[1].lower()]
                                             if ":" in token["deprel"] else 0 for token in x]), docs)),
            n_tokens_in)
        deprel_emb2 = self.tag_em(deprel2)
        sner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token["ner"].split("-")[1].lower()]
                                             if "-" in token["ner"] else 0 for token in x]), docs)),
            n_tokens_in)
        sner_emb = self.tag_em(sner)

        result = torch.cat([upos_emb, xpos_emb, deprel_emb, sner_emb, deprel_emb2], 2)
        result = result.to(get_device())
        result = self.snlp_nn(result)
        return result

    def get_spacy_nlp_vectors(self, texts: List[str]):
        pdict = self.pdict
        nlp = self.nlp
        n_tokens_in = self.n_tokens_in
        with torch.no_grad():
            spacy_texts = list(nlp.pipe(texts, n_process=1))
            text_tensors = list(map(lambda x: torch.tensor(x.tensor), spacy_texts))
            text_tensors = stack_and_pad_tensors(text_tensors, n_tokens_in)
            head_tensors = stack_and_pad_tensors(
                list(map(lambda x: torch.tensor([t.head.tensor for t in x]), spacy_texts)), n_tokens_in)
        text_tensors = text_tensors.to(get_device())
        head_tensors = head_tensors.to(get_device())

        wl = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([len(token) - 1 for token in x]).clamp(0, 15), spacy_texts)), n_tokens_in)
        wl = wl.to(get_device())
        wl_emb = self.w_len(wl)
        wc = (torch.tensor(list(map(len, spacy_texts))) // 10).long().unsqueeze(1).expand(len(texts), n_tokens_in)
        wc = wc.to(get_device())
        wc_emb = self.wc_emb(wc)
        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), spacy_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), spacy_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.pos_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        tag = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.tag_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        tag = tag.to(get_device())
        tag_emb = self.tag_em(tag)
        dep = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.dep_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        dep = dep.to(get_device())
        dep_emb = self.tag_em(dep)
        sw = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.is_stop) for token in x]), spacy_texts)), n_tokens_in)
        sw = sw.to(get_device())
        sw_emb = self.sw_em(sw)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[token.ent_type_.lower()] for token in x]), spacy_texts)), n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)
        is_oov = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.is_oov) for token in x]), spacy_texts)), n_tokens_in)
        is_oov = is_oov.to(get_device())
        is_oov_em = self.is_oov_em(is_oov)
        sent_start = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([int(token.sent_start) for token in x]), spacy_texts)), n_tokens_in)
        sent_start = sent_start.to(get_device())
        sent_start_em = self.sent_start_em(sent_start)
        head_dist = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([float(token.idx - token.head.idx) for token in x]), spacy_texts)),
            n_tokens_in)
        head_dist = head_dist.to(get_device())
        head_dist = head_dist.unsqueeze(2).expand(len(texts), n_tokens_in, 2)

        result = torch.cat([text_tensors, pos_emb, tag_emb, dep_emb, sw_emb, ner_emb, wl_emb, wc_emb,
                            mask, has_digit, is_oov_em, sent_start_em, head_dist, head_tensors], 2)
        result = result.to(get_device())
        result = self.spacy_nn(result)
        return result, spacy_texts

    def get_ibm_max(self, texts: List[str]):
        with torch.no_grad():
            result = self.ibm_max.predict(texts)
        result = result.to(get_device())
        result = self.ibm_nn(result)
        result = result.unsqueeze(1).expand(len(texts), self.n_tokens_in, result.size(1))
        return result

    def get_tmoji(self, texts: List[str]):
        with torch.no_grad():
            tm_probas = self.get_torchmoji_probas(texts)
        tm_probas = self.tm_nn(tm_probas)
        tm_probas = tm_probas.unsqueeze(1).expand(len(texts), self.n_tokens_in, tm_probas.size(1))
        return tm_probas

    def get_keyphrases(self, texts: List[str], spacy_texts):
        tm = self.text_model
        results = [get_pytextrank_wc_keylen(i) for i in spacy_texts]
        key_wc_pytextrank, key_occ_cnt_pytextrank = zip(*results)
        key_wc_pytextrank = stack_and_pad_tensors(key_wc_pytextrank, self.n_tokens_in)
        key_occ_cnt_pytextrank = stack_and_pad_tensors(key_occ_cnt_pytextrank, self.n_tokens_in)
        key_occ_cnt_pytextrank = key_occ_cnt_pytextrank.to(get_device())
        key_wc_pytextrank = key_wc_pytextrank.to(get_device())
        pytextrank_vectors = torch.cat((self.key_wc_pytextrank(key_wc_pytextrank),
                                        self.key_occ_cnt_pytextrank(key_occ_cnt_pytextrank)), 2)  # 16
        pytextrank_vectors = pytextrank_vectors.to(get_device())

        yake_ke = self.kw_extractor
        yake_embs = [[tm.get_sentence_vector(s) for s in map(itemgetter(0), yake_ke.extract_keywords(t))]
                     if has_words(t) else [np.zeros(300)] for t in texts]
        yake_embs = torch.tensor([
            np.average(yk, axis=0, weights=softmax(list(range(len(yk), 0, -1)))).astype(np.float32)
            if len(yk) > 0 else np.zeros(tm.get_dimension(), dtype=np.float32) for yk in yake_embs])
        yake_embs = yake_embs.to(get_device())
        yake_embs = self.yake_nn(yake_embs).unsqueeze(1).expand(len(texts), self.n_tokens_in, self.yake_dims)

        if self.rake is not None:
            rake_ke = self.rake
            rake_embs = [[tm.get_sentence_vector(s) for s in map(itemgetter(0), rake_ke.apply(t))]
                         if has_words(t) else [np.zeros(300)] for t in texts]
            rake_embs = torch.tensor([
                np.average(rk, axis=0, weights=softmax(list(range(len(rk), 0, -1)))).astype(np.float32)
                if len(rk) > 0 else np.zeros(tm.get_dimension(), dtype=np.float32) for rk in rake_embs])
            rake_embs = rake_embs.to(get_device())
            rake_embs = self.rake_nn(rake_embs).unsqueeze(1).expand(len(texts), self.n_tokens_in, self.rake_dims)
            result = torch.cat([pytextrank_vectors, yake_embs, rake_embs], 2)
        else:
            result = torch.cat([pytextrank_vectors, yake_embs], 2)
        result = result.to(get_device())
        result = self.keyphrase_nn(result)
        return result

    def get_word_vectors(self, texts: List[str]):
        cap_method = {"snlp": self.get_stanford_nlp_vectors, "full_view": self.get_sentence_vector,
                      "nltk": self.get_nltk_vectors, "ibm_max": self.get_ibm_max,
                      "tmoji": self.get_tmoji, "gensim": self.get_gensim_word_vectors,
                      "fasttext_crawl": self.get__crawl_word_vectors}
        results = []
        if "spacy" in self.capabilities:
            r, spt = self.get_spacy_nlp_vectors(texts)
            results.append(r)
        if "key_phrases" in self.capabilities and "spacy" in self.capabilities:
            r = self.get_keyphrases(texts, spt)
            results.append(r)
        for c in self.capabilities:
            if c == "spacy" or c == "key_phrases":
                continue
            r = cap_method[c](texts)
            results.append(r)
        clean_memory()
        result = torch.cat(results, 2)
        result = result.to(get_device())
        result = self.contract_nn(result)
        return result
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)

    '''for emo in emoji_ids:
        print(emoji.emojize("emoji is : " + str(emo) + " , "))
    print("-----------")'''

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)
    print("+++++++++++++++++++++++")
    # print(list(emojis))
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import example_helper import json from torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
def text_to_emoji(input_text, max_length):
    # argparser = argparse.ArgumentParser()
    # argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    # argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    # args = argparser.parse_args()

    # Load dictionary for tokenizing
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    # print(f'vocabulary: {vocabulary}')

    with open(os.path.join(os.path.dirname(__file__), './negative_words_parsed.txt'),
              'r', encoding='utf-8', errors='ignore') as negative_words_list:
        negative_words = list(negative_words_list)
    negative_words = [negative_word.rstrip('\n').lower()
                      for negative_word in negative_words if negative_word != '\n']

    with open(os.path.join(os.path.dirname(__file__), './positive_words_parsed.txt'), 'r') as positive_words_list:
        positive_words = list(positive_words_list)
    positive_words = [positive_word.rstrip('\n').lower()
                      for positive_word in positive_words if positive_word != '\n']

    st = SentenceTokenizer(vocabulary, max_length)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    # Determines the important words in the sentence
    tokenized, _, _ = st.tokenize_sentences([input_text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emotion ids
    emotion_ids = top_elements(prob, 5)
    # print(f'top five emotion ids: {emotion_ids}')

    # map to emotions
    emotions = list(map(lambda x: EMOTIONS[x], emotion_ids))
    # print(f'emotions: {emotions}')
    user_feelings = positive_or_negative(emotions)
    # print(f'user_feelings: {user_feelings}')

    # Find the words that are contributing to the feeling
    user_positive_words = []
    user_negative_words = []
    for word in input_text.split(' '):
        if word in positive_words:
            user_positive_words.append(word)
        elif word in negative_words:
            user_negative_words.append(word)

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emotion_ids)
    # print(f'emojis: {list(emojis)}')
    main_vibe = list(emojis)[0]
    # print(f'main_vibe: {main_vibe}')

    json_to_bot = {"user_emotion": user_feelings,
                   "positive": user_positive_words,
                   "negative": user_negative_words,
                   "main_vibe": main_vibe}
    return json.dumps(json_to_bot)
class Dataset(data.Dataset):
    """Custom data.Dataset compatible with data.DataLoader."""

    def __init__(self, data, vocab, hier=False, elmo=False, elmo_pre=None, deepmoji=False):
        self.id, self.X, self.y = data
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
        if self.y is None:
            self.y = None
        else:
            self.y = np.array(list(map(lambda label: self.emotion2label[label], self.y)))
        self.vocab = vocab
        self.num_total_seqs = len(self.X)
        self.tt = MyTokenizer()
        with open(VOCAB_PATH, 'r') as f:
            deepmoji_vocab = json.load(f)
        self.deepmoji_tt = SentenceTokenizer(deepmoji_vocab, 100)
        self.hier = hier
        self.elmo = elmo
        self.elmo_pre = elmo_pre  # pre-extracted elmo embeddings
        self.deepmoji = deepmoji

    def __getitem__(self, index):
        """Returns one data pair (source and target)."""
        ind = self.id[index]
        X_text = self.X[index]
        if self.y is None:
            y = None
        else:
            y = self.y[index]
        if self.hier:
            if self.elmo_pre is not None:
                f = lambda l, d: itemgetter(*l)(d)  # get Tuple(values) with List[keys]
                X_1, X_2, X_3 = self.X[index][0], self.X[index][1], self.X[index][2]
                return (*f([X_1.lower(), X_2.lower(), X_3.lower()], self.elmo_pre), y, ind, X_text)
            X_1, X_2, X_3 = self.preprocess(self.X[index])
            return X_1, X_2, X_3, y, ind, X_text
        else:
            X = self.preprocess(self.X[index])
            return X, y, ind, X_text

    def __len__(self):
        return self.num_total_seqs

    def vectorize(self, sentence):
        sequence = []
        for word in self.tt.tokenize(clean_sentence(sentence)):
            if word in text_to_emoji:
                word = text_to_emoji[word]
            # word = word.translate(None, string.punctuation)
            if constant.extra_prep:
                table = str.maketrans({key: None for key in string.punctuation})
                word = word.translate(table)
                if len(word) == 0:
                    continue
            # the following code maybe not useful at all
            old_word = word
            if word not in constant.gen_vocabs:
                word = word.lower()
            if word not in constant.gen_vocabs:
                word = word[0].upper() + word[1:]
            if word not in constant.gen_vocabs:
                word = word.upper()
            if old_word not in constant.gen_vocabs:
                if word in constant.gen_vocabs:
                    print(">", old_word, word)
            if word not in constant.gen_vocabs:
                word = old_word
            if word in self.vocab.word2index:
                sequence.append(self.vocab.word2index[word])
            else:
                sequence.append(constant.UNK_idx)
        return sequence

    def preprocess(self, arr):
        """Converts words to ids."""
        t1 = 'CLS ' + arr[0].lower()
        t2 = 'CLS ' + arr[1].lower()
        t3 = 'CLS ' + arr[2].lower()
        # print("preprocess deepmoji=", self.deepmoji)
        if self.elmo:
            t1 = self.tt.tokenize(clean_sentence(t1))
            t2 = self.tt.tokenize(clean_sentence(t2))
            t3 = self.tt.tokenize(clean_sentence(t3))
            if self.hier:
                return t1, t2, t3
            else:
                return np.concatenate((t1, t2, t3))
        elif self.deepmoji:
            t1, _, _ = self.deepmoji_tt.tokenize_sentences([t1])  # vectorize
            t2, _, _ = self.deepmoji_tt.tokenize_sentences([t2])
            t3, _, _ = self.deepmoji_tt.tokenize_sentences([t3])
            t1 = np.trim_zeros(t1.astype(np.int32)[0])
            t2 = np.trim_zeros(t2.astype(np.int32)[0])
            t3 = np.trim_zeros(t3.astype(np.int32)[0])
            if self.hier:
                return torch.LongTensor(t1), torch.LongTensor(t2), torch.LongTensor(t3)
            else:
                return torch.LongTensor(t1 + t2 + t3)
        else:
            t1 = self.vectorize(t1)
            t2 = self.vectorize(t2)
            t3 = self.vectorize(t3)
            if self.hier:
                return torch.LongTensor(t1), torch.LongTensor(t2), torch.LongTensor(t3)
            else:
                return torch.LongTensor(t1 + t2 + t3)
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import example_helper import json from torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
print(f'test_sentences length: {len(test_sentences[0])}')

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
# st = SentenceTokenizer(vocabulary, args.maxlen)
st = SentenceTokenizer(vocabulary, 500)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

# Running predictions
# Determines the important words in the sentence
# tokenized, _, _ = st.tokenize_sentences([args.text])
tokenized, _, _ = st.tokenize_sentences(test_sentences[0])
# print(f'tokenized words: {tokenized}')

# Get sentence probability
# prob = model(tokenized)[0]
print(f'tokenized: {tokenized}')
prob = model(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the torchMoji repo.
# print(f'prob:{prob}')
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
print(f'prob: {prob}')
for i, t in enumerate(test_sentences[0]):
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.