Example #1
0
def test_torchmoji_return_attention():
    """Model yields one output by default and a second (attention
    weights) when constructed with return_attention=True."""
    dummy_input = np.array([[1]])

    # Default model: a single output per forward pass.
    default_model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    assert len(default_model(dummy_input)) == 1

    # Same check with attention weights also returned.
    attention_model = torchmoji_emojis(weight_path=PRETRAINED_PATH,
                                       return_attention=True)
    assert len(attention_model(dummy_input)) == 2
Example #2
0
    def __init__(self):
        """Load the torchMoji model, vocabulary, emoji mapping and the
        emoji-to-sentiment reduction table.

        Downloads the pretrained weights on first use.
        """
        # Automatically download weights if they are not present yet.
        # NOTE(review): relies on a torchMoji checkout being present in
        # the current working directory — confirm deployment layout.
        if not os.path.isfile(PRETRAINED_PATH):
            os.system(
                "(cd torchMoji && python scripts/download_weights_yes.py)")

        # Instantiate the pytorch model.
        self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH)

        # Load the vocabulary mapping words to token ids.
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        # Tokenizer that splits a sentence into words.
        # NOTE(review): self._max_message_len_words must be defined
        # elsewhere on this class (not visible here) — confirm.
        self._st = SentenceTokenizer(vocabulary, self._max_message_len_words)

        # Mapping from neural-network prediction index to emoji code.
        emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json")
        with open(emoji_codes_path, 'r') as f:
            self._emoji_codes = json.load(f)

        # Reduction of the 64 emojis to their "happiness" boolean flag.
        with open("sentiment.json", 'r') as f:
            self._sentiments = json.load(f)
Example #3
0
    def __init__(self):
        """Load the vocabulary, build a sentence tokenizer, and load the
        pretrained torchMoji model."""
        with open(vocab_file_path, 'r') as vocab_file:
            vocab = json.load(vocab_file)

        # Sentences longer than 100 tokens are capped by the tokenizer.
        self.st = SentenceTokenizer(vocab, 100)
        self.model = torchmoji_emojis(model_weights_path)
Example #4
0
    def __init__(self, max_sentence_length=30):
        """Set up the tokenizer and load the pretrained model.

        Args:
            max_sentence_length: maximum number of tokens per sentence
                handled by the tokenizer.
        """
        # Vocabulary maps words to token ids for the tokenizer.
        with open(VOCAB_PATH, 'r') as vocab_file:
            self.vocabulary = json.load(vocab_file)
        self.st = SentenceTokenizer(self.vocabulary, max_sentence_length)

        # Pretrained emoji-prediction model.
        self.model = torchmoji_emojis(PRETRAINED_PATH)
Example #5
0
def test_score_emoji():
    """Top-5 emoji predictions match the known-good indices for a set of
    reference sentences."""
    test_sentences = [
        'I love mom\'s cooking', 'I love how you never reply back..',
        'I love cruising with my homies', 'I love messing with yo mind!!',
        'I love you and now you\'re just gone..', 'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_k(values, k):
        # Indices of the k largest entries, ordered highest first.
        candidates = np.argpartition(values, -k)[-k:]
        return candidates[np.argsort(values[candidates])][::-1]

    # Tokenize the reference sentences with the released vocabulary.
    with open(VOCAB_PATH, 'r') as vocab_file:
        vocabulary = json.load(vocab_file)
    tokenizer = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = tokenizer.tokenize_sentences(test_sentences)

    # Run the pretrained model to get per-sentence emoji probabilities.
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    probs = model(tokens)

    # Compare the top-5 emoji indices against the expected ones.
    for sentence_probs, expected_ids in zip(probs, expected):
        assert np.array_equal(top_k(sentence_probs, 5), expected_ids)
Example #6
0
    args = argparser.parse_args()
    sentence_probs = []
    retokenized_sentences = []
    output_path = os.path.join(os.path.dirname(args.filepath),
                               'sentence_emojis.pkl')
    retokenized_sentences_output_path = os.path.join(
        os.path.dirname(args.filepath), 'retokenized_sentences.pkl')

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    sentences = load_pickle(args.filepath)
    # TODO: encode multiple sentences at once.
    #  Needs TorchMoji module to handle empty sentences and output equal probabilities
    # flattened_sentences = [utterance for conversation in sentences for utterance in conversation]
    # print('Encoding sentences ...')
    # flattened_tokenized, _, _ = st.tokenize_sentences(flattened_sentences)
    # flattened_probs = model(flattened_tokenized)
    # print('TorchMoji encoding done.')
    idx = 0
    for conversation in sentences:
        idx += 1
        conversation_probs = []
        conversation_retokenized = []
        for sentence in conversation: