def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
def test_encode_texts():
    """ Text encoding is stable. """

    TEST_SENTENCES = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)

    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
def __init__(self): """ Ctor. """ # Automatically download weights if not os.path.isfile(PRETRAINED_PATH): os.system( "(cd torchMoji && python scripts/download_weights_yes.py)") # Instanciate a pytorch model self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH) # Load vocabulary with open(VOCAB_PATH, 'r') as f: vocabulary = json.load(f) # Create tokenizer to split a sentence into words self._st = SentenceTokenizer(vocabulary, self._max_message_len_words) # Load a mapping in neural network prediction to smileys emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json") with open(emoji_codes_path, 'r') as f: self._emoji_codes = json.load(f) # This is a reduction of 64 smileys into there "happiness" bool flag with open("sentiment.json", 'r') as f: self._sentiments = json.load(f) pass
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   [data['train_ind'],
                                                    data['val_ind'],
                                                    data['test_ind']],
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
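# A minimal usage sketch for load_benchmark above (not from the original
# source). The benchmark pickle path is a hypothetical placeholder; the
# vocabulary path follows the convention used by the other snippets here.
import json

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# Extend the vocabulary with up to 10000 tokens from the benchmark's training split
data = load_benchmark('../data/benchmark_dataset.pickle', vocab, extend_with=10000)

train_texts, val_texts, test_texts = data['texts']
train_labels, val_labels, test_labels = data['labels']
print('added: {}, maxlen: {}, batch_size: {}'.format(
    data['added'], data['maxlen'], data['batch_size']))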
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])

    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])

    assert st.to_sentence(token[0]) == expected
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # pickle requires binary mode when writing under Python 3
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print(' done. Coverage: {}'.format(cover))
def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
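# Standalone sketch (not part of the original tests) of what the top_elements
# helper above computes: the indices of the k largest entries, highest first.
import numpy as np

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

probs = np.array([0.1, 0.7, 0.05, 0.15])
print(top_elements(probs, 2))  # -> [1 3]: index 1 (0.7) first, then index 3 (0.15)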
class Botmoji():
    def __init__(self, max_sentence_length=30):
        # Tokenizing using the dictionary
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
        self.st = SentenceTokenizer(self.vocabulary, max_sentence_length)

        # Loading the model
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def emojize_text(self, text, maxemojis, minconfidence):
        prob = self.encode(text)

        # Top emoji ID
        emoji_ids = top_emojis(prob, maxemojis, minconfidence)
        if len(emoji_ids) == 0:
            return ''

        # Map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True)

    def encode(self, text):
        # Running predictions
        tokenized, _, _ = self.st.tokenize_sentences([text])
        # Getting emojis probabilities
        prob = self.model(tokenized)[0]
        return prob

    def encode_multiple(self, texts):
        filtered_texts = ['_' if text == '' else text for text in texts]
        # Running predictions
        tokenized, _, _ = self.st.tokenize_sentences(filtered_texts)
        # Getting emojis probabilities
        prob = self.model(tokenized)
        return prob
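# Hypothetical usage of the Botmoji wrapper above; it assumes VOCAB_PATH,
# PRETRAINED_PATH, EMOJIS and the top_emojis helper are available exactly as
# in the class definition.
botmoji = Botmoji(max_sentence_length=30)

# Annotate a single message with at most 3 emojis above 5% confidence
print(botmoji.emojize_text('I love how you never reply back..', 3, 0.05))

# Batch-encode several messages into per-sentence emoji probability vectors
probs = botmoji.encode_multiple(['This is the shit', ''])
print(len(probs))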
class Emojize:
    def __init__(self):
        with open(vocab_file_path, 'r') as f:
            vocabulary = json.load(f)

        max_sentence_length = 100
        self.st = SentenceTokenizer(vocabulary, max_sentence_length)
        self.model = torchmoji_emojis(model_weights_path)

    def predict(self, text):
        if not isinstance(text, list):
            text = [text]

        tokenized, _, _ = self.st.tokenize_sentences(text)
        prob = self.model(tokenized)[0]
        emoji_ids = top_elements(prob, 1)
        print("Emo Id: ", emoji_ids)

        # Emoji map in emoji_overview.png
        emojis = EMOJIS[emoji_ids[0]]
        return emojis
argparser = argparse.ArgumentParser()
argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
args = argparser.parse_args()

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, args.maxlen)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

# Running predictions
tokenized, _, _ = st.tokenize_sentences([args.text])
# Get sentence probability
prob = model(tokenized)[0]

# Top emoji id
emoji_ids = top_elements(prob, 5)

# map to emojis
emojis = map(lambda x: EMOJIS[x], emoji_ids)

# Print the input text followed by its top predicted emojis
print(emoji.emojize("{} {}".format(args.text, " ".join(emojis)), use_aliases=True))
coverage_result = [p]
print('Calculating coverage for {}'.format(p))
with open(p, 'rb') as f:
    if IS_PYTHON2:
        s = pickle.load(f)
    else:
        s = pickle.load(f, fix_imports=True)

# Decode data
try:
    s['texts'] = [unicode(x) for x in s['texts']]
except UnicodeDecodeError:
    s['texts'] = [x.decode('utf-8') for x in s['texts']]

# Own
st = SentenceTokenizer({}, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=10000)
coverage_result.append(coverage(tests[2]))

# Last
st = SentenceTokenizer(vocab, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=0)
coverage_result.append(coverage(tests[2]))

# Full
                       type=int, default=30, help="Max length of input text")
args = argparser.parse_args()

sentence_probs = []
retokenized_sentences = []
output_path = os.path.join(os.path.dirname(args.filepath), 'sentence_emojis.pkl')
retokenized_sentences_output_path = os.path.join(
    os.path.dirname(args.filepath), 'retokenized_sentences.pkl')

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, args.maxlen)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)

sentences = load_pickle(args.filepath)

# TODO: encode multiple sentences at once.
# Needs TorchMoji module to handle empty sentences and output equal probabilities
# flattened_sentences = [utterance for conversation in sentences for utterance in conversation]
# print('Encoding sentences ...')
# flattened_tokenized, _, _ = st.tokenize_sentences(flattened_sentences)
# flattened_probs = model(flattened_tokenized)
# print('TorchMoji encoding done.')

idx = 0
for conversation in sentences:
    idx += 1
    'This is the shit'
]

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

maxlen = 30

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_emojis(PRETRAINED_PATH)
print(model)

print('Running predictions.')
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
prob = model(tokenized)

for prob in [prob]:
    # Find top emojis for each sentence. Emoji ids (0-63)
    # correspond to the mapping in emoji_overview.png
    # at the root of the torchMoji repo.
    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
    },
    {
        'label': 'sentence 5'
    },
    {
        'label': 'sentence 6'
    },
    {
        'label': 'sentence 7'
    },
    {
        'label': 'sentence 8'
    },
    {
        'label': 'sentence 9'
    },
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import json from torchMoji.torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
class Sentiment:
    """
    Wrapper class for torchMoji.
    Repo: https://github.com/huggingface/torchMoji
    Original DeepMoji paper:
    "Using millions of emoji occurrences to learn any-domain representations
    for detecting sentiment, emotion and sarcasm"
    https://arxiv.org/pdf/1708.00524.pdf
    """

    # Maximum length of a sentence in words
    _max_message_len_words = 30

    # Top K emoji predictions to derive sentiment from
    _top_k_predictions = 5

    def __init__(self):
        """ Ctor. """
        # Automatically download weights
        if not os.path.isfile(PRETRAINED_PATH):
            os.system(
                "(cd torchMoji && python scripts/download_weights_yes.py)")

        # Instantiate a pytorch model
        self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH)

        # Load vocabulary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        # Create tokenizer to split a sentence into words
        self._st = SentenceTokenizer(vocabulary, self._max_message_len_words)

        # Load a mapping from neural network predictions to emojis
        emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json")
        with open(emoji_codes_path, 'r') as f:
            self._emoji_codes = json.load(f)

        # This is a reduction of the 64 emojis into their "happiness" bool flag
        with open("sentiment.json", 'r') as f:
            self._sentiments = json.load(f)

    def __call__(self, message: str) -> bool:
        """
        Perform inference.
        :param message: text sentence
        :return: True if the sentence has positive sentiment, False if negative
        """
        assert isinstance(message, str)

        # Tokenize sentence
        tokens, _, _ = self._st.tokenize_sentences([message])

        # Neural network inference
        prob, *_ = self._model(tokens)

        # Analyse only the top K predictions out of 64
        top = top_elements(prob, self._top_k_predictions)

        if False:
            # See original top-K emojis before reduction
            top_emoji = [self._emoji_codes[str(i)] for i in top]
            print(" ".join(
                [emoji.emojize(e, use_aliases=True) for e in top_emoji]))

        # Reduce to "happiness" flag
        votes = [self._sentiments[str(e)] for e in top]
        result = majority_vote(votes)

        return result
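# Hypothetical usage of the Sentiment wrapper above; it assumes the torchMoji
# weights, vocabulary.json, emoji_codes.json and sentiment.json files are in
# place, as expected by the constructor.
if __name__ == "__main__":
    sentiment = Sentiment()
    print(sentiment("I love mom's cooking"))               # likely True (positive)
    print(sentiment("I love how you never reply back.."))  # likely False (negative)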