Example #1
def preprocess(train_content, train_label, test_content, test_label):
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(train_content)  # build the vocabulary from the raw training texts
    x_train_seq = tokenizer.texts_to_sequences(train_content)
    x_test_seq = tokenizer.texts_to_sequences(test_content)
    x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=MAX_LEN)
    y_train = np.array(train_label)
    y_test = np.array(test_label)
    return x_train, y_train, x_test, y_test, tokenizer.word_index
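A hedged usage sketch (not part of the original snippet): preprocess() relies on module-level constants NUM_WORDS and MAX_LEN and on the usual Keras imports, so the values and toy data below are assumptions for illustration only.

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

NUM_WORDS = 10000  # assumed vocabulary cap
MAX_LEN = 100      # assumed padded sequence length

train_texts = ['a great movie', 'a terrible movie']   # hypothetical data
test_texts = ['great acting']
x_train, y_train, x_test, y_test, word_index = preprocess(
    train_texts, [1, 0], test_texts, [1])
print(x_train.shape)  # (2, 100): two reviews, each padded/truncated to MAX_LEN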
Example #2
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
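For reference, a minimal sketch (not from the original test) of what fit_on_sequences is actually for: when only integer sequences are available and no raw text, it records the per-index document counts that sequences_to_matrix needs for its 'tfidf' mode.

from keras.preprocessing.text import Tokenizer

sequences = [[1, 2, 3, 1], [1, 4, 5], [2, 2, 6]]  # hypothetical pre-encoded documents
tokenizer = Tokenizer(num_words=10)               # num_words is required since there is no word_index
tokenizer.fit_on_sequences(sequences)             # populates document_count and index_docs
tfidf = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
print(tfidf.shape)                                # (3, 10)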
Example #3
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
Example #4
    def vectorize(self, data_set):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)
        sequences = tokenizer.texts_to_sequences(data_set)  # texts_to_sequences returns the integer sequences (fit_on_sequences returns None)

        # word_index = tokenizer.word_index
        data_feature = pad_sequences(sequences, maxlen=self.max_len)

        return data_feature
Example #5
def test_tokenizer():
    texts = [
        'The cat sat on the mat', 'The dog sat on the log',
        'Dogs and cats living together'
    ]
    # num_words: maximum number of words to keep; the tokenizer only handles the n most frequent words in the dataset
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)
    print('word_counts: ', tokenizer.word_counts)  # how many times each word appeared during fitting
    print('word_docs: ', tokenizer.word_docs)  # in how many documents each word appeared during fitting
    print('word_index: ', tokenizer.word_index)  # frequency-rank index of each word
    print('document_count: ', tokenizer.document_count)  # number of documents the tokenizer was fitted on

    # test the sequence generator
    sequences = []
    for seg in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seg)

    # test expressing the texts as feature matrices
    tokenizer.fit_on_sequences(sequences)
    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
        print(mode, " : ", matrix)
Example #6
    def model_setup(self):
        """
        Sets up the model for generating a poem with a tokenizer, and splits the conversations
        into sequences.

        Void -> [Tupleof Tokenizer Int]
        """
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.conversations)  # fit_on_sequences expects integer sequences, not raw conversations, so it is not needed here

        # n-gram sequences
        self.input_sequences = []
        for line in self.conversations:
            token_list = tokenizer.texts_to_sequences([line])[0]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i + 1]
                self.input_sequences.append(n_gram_sequence)
        max_sequence_len = max([len(x) for x in self.input_sequences])
        self.input_sequences = np.array(
            pad_sequences(self.input_sequences,
                          maxlen=max_sequence_len,
                          padding='pre'))
        return (tokenizer, max_sequence_len)
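A sketch of the usual next step after model_setup(), not part of the original class: each padded n-gram row is split into predictors and a one-hot next-word label before training.

from keras.utils import to_categorical

def split_ngram_sequences(input_sequences, vocab_size):
    # input_sequences: the padded array produced by model_setup()
    predictors = input_sequences[:, :-1]   # every token except the last
    next_words = input_sequences[:, -1]    # the token the model should predict
    return predictors, to_categorical(next_words, num_classes=vocab_size)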
Example #7

publ_test = get_data()
content_test = publ_test['content']
classes_test = publ_test['class']

# - - - - - - - - - - - - - - - - - - - - - - - -
# Data preprocessing
tk = Tokenizer()
tk.fit_on_texts(content_test)
textSequences_test = tk.texts_to_sequences(content_test)

num_words = 80059
num_classes = 8
tk = Tokenizer(num_words=num_words)
tk.fit_on_sequences(textSequences_test)

X_test = tk.sequences_to_matrix(textSequences_test, mode='tfidf')
y_test = keras.utils.to_categorical(classes_test, num_classes)

# - - - - - - - - - - - - - - - - - - - - - - - -
# News classification
print('\n Results:')
for i in range(len(X_test)):
    prediction = model.predict(np.array([X_test[i]]))
    pred = np.argsort(-prediction)
    print("[", i, "] ", publ_test['title'][i])
    print("Тренировочная категория:    ", kfu_classes[classes_test[i]])
    print("Предсказанная категория:    ", kfu_classes[pred[0][0]])
    print("Вероятность класса: %.2f%%" % (prediction[0][pred[0][0]] * 100))
Example #8
            if label_type == "neg":
                labels.append(0)
            else:
                labels.append(1)

# b. Tokenize the data
"""
Pretrained word embeddings are especially useful when very little training data is
available, so the training data is limited to the first 200 samples: the model has to
learn to classify movie reviews after seeing only 200 examples.
"""
maxlen = 100  # truncate each review after its first 100 words
training_samples = 200  # train on 200 reviews
validation_samples = 10000  # validate on 10,000 reviews
max_words = 10000  # only consider the 10,000 most frequent words across all reviews

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)  # build the word index from the raw review texts
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % (len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print("Shape of data tensor:", data.shape)
print("Shape of label tensor:", labels.shape)

indices = np.arange(data.shape[0])  # shuffle before splitting into training/validation sets: the raw data has all positive reviews first and all negative ones last
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
Example #9
                                          verbose=2)

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

# run simple linear regression to compare performance

# based on the grid search done in:
# https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb

# the tf-idf vectors capture co-occurrence statistics: think of each number as how many
# times a word occurred in a text, downweighted by how common that word is across documents

tfidfTokenizer = Tokenizer(num_words=max_features)  # num_words is the current name of the old nb_words argument
tfidfTokenizer.fit_on_sequences(X_train.tolist())
X_train_tfidf = np.asarray(
    tfidfTokenizer.sequences_to_matrix(X_train.tolist(), mode="tfidf"))
X_test_tfidf = np.asarray(
    tfidfTokenizer.sequences_to_matrix(X_test.tolist(), mode="tfidf"))

#check tfidf matrix
print(X_train_tfidf)
print(X_train_tfidf.shape, X_test_tfidf.shape)

from sklearn.linear_model import LogisticRegression

model_tfidf_reg = LogisticRegression(random_state=0,
                                     C=0.001,
                                     penalty='l2',
                                     verbose=1)
Example #10
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)  # 'lines' are raw text lines, so fit_on_texts is the right call
    return tokenizer
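A short hypothetical usage of create_tokenizer(); the sample lines below are assumptions for illustration only.

lines = ['first training sentence', 'second training sentence']
tokenizer = create_tokenizer(lines)
print(tokenizer.word_index)                 # word -> integer index mapping
print(tokenizer.texts_to_sequences(lines))  # the lines encoded with that mapping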
Example #11
                    verbose=2)

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

# run simple linear regression to compare performance

# based on the grid search done in:
# https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb

# the tf-idf vectors capture co-occurrence statistics: think of each number as how many
# times a word occurred in a text, downweighted by how common that word is across documents

tfidfTokenizer = Tokenizer(num_words=max_features)  # num_words is the current name of the old nb_words argument
tfidfTokenizer.fit_on_sequences(X_train.tolist())
X_train_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_train.tolist(), mode="tfidf"))
X_test_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_test.tolist(), mode="tfidf"))

#check tfidf matrix
print(X_train_tfidf)
print(X_train_tfidf.shape, X_test_tfidf.shape)

from sklearn.linear_model import LogisticRegression

model_tfidf_reg = LogisticRegression(random_state=0, C=0.001, penalty='l2', verbose=1)
model_tfidf_reg.fit(X_train_tfidf, y_train)

from sklearn.metrics import accuracy_score
#calculate test and train accuracy
print("train acc:", accuracy_score(y_test, model_tfidf_reg.predict(X_train_tfidf)))
Example #12
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras import optimizers
import string
from keras.layers import Dropout
#PRE_PROCESSING THE CLEAN DATASET
#lines = training_set.split('\n')
training_set_clean = [
    line.rstrip('\n')
    for line in open('training_set_clean.txt', encoding='ISO-8859-1')
]
#lines=[l.split('\n') for l in training_set_clean]
#import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_set_clean)  # build the vocabulary from the cleaned lines (fit_on_sequences expects integer sequences, not text)
sequences = tokenizer.texts_to_sequences(training_set_clean)

vocab_size = len(tokenizer.word_index) + 1

sequences = array(sequences)
X_train = sequences[:, :-1]
y_train = sequences[:, -1]
y_train = to_categorical(y_train, num_classes=vocab_size)
seq_length = X_train.shape[1]
print(X_train[0])

#TRAIN THE MODEL
regressor = Sequential()
regressor.add(Embedding(vocab_size, 20, input_length=seq_length))
Example #13
# Training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(content_train)
textSequences = tokenizer.texts_to_sequences(content_train)

X_train, y_train, X_test, y_test = split_data(textSequences, classes_train,
                                              0.9)
total_words = len(tokenizer.word_index)
print('The vocabulary contains {} words'.format(total_words))

num_words = 80059
num_classes = 8
print(u'Vectorizing...')
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_sequences(X_train)
X_train = tokenizer.sequences_to_matrix(X_train, mode='tfidf')
tokenizer.fit_on_sequences(X_test)
X_test = tokenizer.sequences_to_matrix(X_test, mode='tfidf')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

epochs = 10  # number of training epochs/iterations
total_categories = 8

print(u'Building the classifier...')
model = Sequential()
model.add(Dense(256, input_shape=(num_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(total_categories))
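The snippet is cut off after the last Dense layer; a typical continuation for this kind of bag-of-words classifier might look like the sketch below (an assumption, not the original author's code).

model.add(Activation('softmax'))  # turn the 8 class scores into probabilities
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=32,
          epochs=epochs,
          validation_data=(X_test, y_test))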
Example #14
def convModel(tweets, stances, tweets_test, stances_test):
    #General Parameters
    global max
    embeding_dim = 200
    dropout_prob = (0.0, 0.5)
    batch_size = 64
    num_epochs = 20

    print('Fitting tokenizer')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweets + tweets2)  # the tweets are raw strings, so the vocabulary has to be built with fit_on_texts
    max_length = max([len(s.split()) for s in tweets + tweets2])
    print('max_length', max_length)

    vocab_size = len(tokenizer.word_index) + 1

    #Train and test split
    print('Train and test split')
    x_train, x_test, y_train, y_test = train_test_split(tweets, stances, test_size=0.2)
    print('x_train: ', len(x_train), 'x_test', len(x_test))


    #Training data
    #traindata = np.array(x_train)
    #testdata = np.array(x_test)

    trainTokens = tokenizer.texts_to_sequences(x_train)
    Xtrain = pad_sequences(trainTokens, maxlen=max_length, padding='post')
    XtestTokens = tokenizer.texts_to_sequences(x_test)
    Xtest = pad_sequences(XtestTokens, maxlen=max_length, padding='post')
    #============ TEST DATA =============================================
    #testgroup = np.array(tweets_test)
    #testGroupTokens = tokenizer.texts_to_sequences(tweets_test)
    #XtestGroup = pad_sequences(testGroupTokens, maxlen=max_length, padding='post')
    #print('Xtrain padding: ', len(Xtrain), 'Xtest padding: ', len(Xtest), 'XtestGroup padding: ', len(XtestGroup))

    #Convert stances to categorical output
    y_test = np_utils.to_categorical(y_test, num_classes=3)
    y_train = np_utils.to_categorical(y_train, num_classes=3)
    y_testGroup = np_utils.to_categorical(stances_test, num_classes=3)
    print('y_test: ', len(y_test), 'y_train: ', len(y_train), 'y_testGroup: ', len(stances_test))


    print('Loading embeddings..')
    #load word2vec and create embedding layer
    wv_from_bin = KeyedVectors.load_word2vec_format(datapath('E:/glove/glove.twitter.27B.200dGINSIM.txt'),binary=False)
    embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())
    embedding_layer = Embedding(vocab_size, embeding_dim, weights=[embedding_vectors], input_length=max_length, trainable=False)

    #Create the model
    print('Create and compile the model..')
    model = createModelC(max_length, embedding_layer)
    model.compile(loss="categorical_hinge", optimizer="adam", metrics=[f1])
    model.summary(85)

    print('Fitting the model..')
    history = model.fit(Xtrain, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_data=(Xtest, y_test), verbose=2)
    print('History', history.history)

    # evaluate
    print('Predicting (training)..')
    ypred = model.predict(Xtest)
    print('Loss (TRAIN split): %f' % model.evaluate(Xtest, y_test)[0])  # evaluate() returns [loss, f1]; index 0 is the loss, not an accuracy
    print('FScore (TRAIN): %f' % (f1(y_test, ypred)*100))

    print('Predicting (testing)..')
Example #15
from keras.preprocessing.text import Tokenizer
import numpy as np

texts = [
    'The cat sat on the mat.', 'The dog sat on the log.',
    'Dogs and cats living together.'
]
tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)

sequences = []
for seq in tokenizer.texts_to_sequences_generator(texts):
    sequences.append(seq)
assert np.max(np.max(sequences)) < 10
assert np.min(np.min(sequences)) == 1

tokenizer.fit_on_sequences(sequences)

for mode in ['binary', 'count', 'tfidf', 'freq']:
    matrix = tokenizer.texts_to_matrix(texts, mode)

print("texts:", texts)
print("=> Found %s unique tokens <=" % len(tokenizer.word_index))
Example #16
class SentimentLSTM:
    def __init__(self):
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.stop_words = []
        self.model = None

    def load_stop_word(self, path='E:/dataset/NLP/stopwords'):
        with open(path, 'r', encoding='utf-8') as f:  # read the stop-word file as UTF-8 text
            for line in f:
                content = line.strip()
                self.stop_words.append(content)  # lines are already str, no decode() needed

    def jieba_cut(self, line):
        lcut = jieba.lcut(line)
        cut = [x for x in lcut if x not in self.stop_words]
        cut = " ".join(cut)
        return cut

    def load_cuted_corpus(self, dir, input):
        f = open(dir + '/' + input, 'r')
        lines = f.readlines()
        texts = []
        labels = []
        for line in lines:
            fields = line.split()
            rate = int(fields[0])
            if rate == 0 or rate == 3:
                continue
            elif rate < 3:
                rate = 0
            else:
                rate = 1
            cont = fields[1:]
            cont = " ".join(cont)
            texts.append(cont)
            labels.append(rate)

        self.tokenizer.fit_on_texts(texts)  # texts are space-joined word strings, so fit_on_texts builds the vocabulary
        f.close()
        return texts, labels

    def load_data(self):
        x, y = self.load_cuted_corpus('corpus', 'review.csv')
        x = self.tokenizer.texts_to_sequences(x)
        x = S.pad_sequences(x, maxlen=sentence_max_len)
        y = to_categorical(y, num_classes=2)
        return ((x[0:500000], y[0:500000]), (x[500000:], y[500000:]))

    def train(self, epochs=50):
        print('building model========================')
        self.model = SentimentLSTM.build_model()

        print('loading data===========================')
        (text_train, rate_train), (text_test, rate_test) = self.load_data()

        print("training===============================")
        self.model.fit(text_train, rate_train, batch_size=1000, epochs=epochs)
        self.model.save('')
        score = self.model.evaluate(text_test, rate_test)
        print(score)

    def load_trained_model(self, path):
        model = SentimentLSTM.build_model()
        model.load_weights(path)
        return model

    def predict_text(self, text):
        if self.model is None:
            self.model = self.load_trained_model(model_savepath)
            self.load_stop_word()
            self.load_cuted_corpus('corpus', 'review.csv')

        vect = self.jieba_cut(text)  # space-joined words after jieba segmentation
        vect = self.tokenizer.texts_to_sequences([vect])  # no encode(): the tokenizer expects str, not bytes
        print(vect)
        return self.model.predict_classes(S.pad_sequences(vect, maxlen=100))

    @staticmethod
    def build_model():
        model = Sequential()
        model.add(Embedding(vocab_size, 256, input_length=sentence_max_len))
        model.add(Bidirectional(LSTM(128, implementation=2)))
        model.add(Dropout(0.5))
        model.add(Dense(2, activation='softmax'))  # softmax output to match the categorical_crossentropy loss
        model.compile('RMSprop',
                      'categorical_crossentropy',
                      metrics=['accuracy'])
        return model
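A hypothetical driver for the class above: vocab_size, sentence_max_len and model_savepath are globals the class expects but never defines in this snippet, so every value below is a placeholder.

vocab_size = 50000
sentence_max_len = 100
model_savepath = 'sentiment_lstm.h5'

classifier = SentimentLSTM()
classifier.load_stop_word()   # reads the stop-word list from disk
classifier.train(epochs=5)    # loads corpus/review.csv, builds and fits the model
print(classifier.predict_text('这部电影很好看'))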
Example #17
target = [d.topic for d in docarr]

# text_path = d_class.text_file
# label_path = d_class.gnd_file
# with open(text_path) as f1, open(label_path) as f2:
#     data = [text.strip() for text in f1]
#     target = [int(label.rstrip('\n')) for label in f2.readlines()]
# tokenizer = Tokenizer(char_level=False)
# tokenizer.fit_on_texts(data)
# sequences_full = tokenizer.texts_to_sequences(data)
# tokenizer.fit_on_sequences(sequences_full)
# word_index = tokenizer.word_index

tokenizer = Tokenizer(char_level=False)
tokenizer.word_index = word_index
tokenizer.fit_on_sequences(sequences_full)
seq_lens = [len(s) for s in sequences_full]
MAX_SEQ_LEN = max(seq_lens)
print("Total: %s short texts" % format(len(docarr), ","),
      ' %s unique tokens.' % len(word_index))
print("Average length: %d" % np.mean(seq_lens),
      ", Max length: %d" % max(seq_lens))

X = pad_sequences(sequences_full, maxlen=MAX_SEQ_LEN)
y = target

#################################################
# Preparing embedding matrix
#################################################

EMBED_DIM = 300
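The snippet stops right after announcing the embedding matrix; a common way to build it is sketched below, where embeddings_index (a word -> 300-d vector dict, e.g. loaded from a GloVe file) is an assumption rather than part of the original code.

embedding_matrix = np.zeros((len(word_index) + 1, EMBED_DIM))
for word, i in word_index.items():
    vector = embeddings_index.get(word)   # hypothetical lookup: word -> pretrained vector
    if vector is not None:
        embedding_matrix[i] = vector      # words without a pretrained vector stay all-zero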
Example #18
all_participants_mix_stopwords = all_participants.copy()
all_participants_mix_stopwords['answer'] = all_participants_mix_stopwords.apply(lambda row: text_to_wordlist(row.answer, remove_stopwords=False).split(), axis=1)

words = [w for w in all_participants_mix['answer'].tolist()]
words = set(itertools.chain(*words))
vocab_size = len(words)

words_stop = [w for w in all_participants_mix_stopwords['answer'].tolist()]
words_stop = set(itertools.chain(*words_stop))
vocab_size_stop = len(words_stop)

windows_size = WINDOWS_SIZE
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(all_participants_mix['answer'])  # the 'answer' column already holds tokenized word lists, so fit_on_texts alone is enough

all_participants_mix['t_answer'] = tokenizer.texts_to_sequences(all_participants_mix['answer'])

def test_model(text, model):
    word_list = text_to_wordlist(text)
    list_of_words = word_list.split(" ")
    sequences = tokenizer.texts_to_sequences([word_list])
    word_tokens = sequences[0]
    size = len(word_tokens)
    test_phrases = []
    for i in range(size):
        tokens = word_tokens[i:min(i+windows_size,size)]
        test_phrases.append(tokens)
    sequences_input = test_phrases
    sequences_input =  pad_sequences(sequences_input, value=0, padding="post", maxlen=windows_size)
Example #19
def tokenizer_fit_xvals(t_xvals):
    t = Tokenizer(num_words=None, lower=False, oov_token="_NA")
    t.fit_on_texts(t_xvals)  # assuming t_xvals holds raw strings; lower/oov_token only take effect when fitting on texts
    return t
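Illustration only (the sample strings are assumptions): with lower=False the tokenizer is case-sensitive, and words unseen during fitting map to the "_NA" OOV token instead of being silently dropped.

t = tokenizer_fit_xvals(['Alpha Beta', 'Beta Gamma'])
print(t.word_index)                           # includes '_NA' as the out-of-vocabulary entry
print(t.texts_to_sequences(['Alpha Delta']))  # 'Delta' is encoded with the '_NA' index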
Example #20
test_content = d['CONTENT'].iloc[test_idx]

# In[05]: Theory Answer No. 3

import numpy as np
from sklearn.model_selection import train_test_split
X, y = np.arange(10).reshape((5, 2)), range(5)
X

# In[06]: Answer No. 5

from keras.preprocessing.text import Tokenizer
ns = Tokenizer(num_words=2)
yn = ["Jawaban No5", "yes", "Berhasil"]
ns.fit_on_texts(yn)  # fit_on_sequences is not needed: yn contains raw strings, not integer sequences
ns.word_index

# In[07]: Question No. 6

d_train_inputs = tokenizer.texts_to_matrix(train_content, mode='tfidf')
d_test_inputs = tokenizer.texts_to_matrix(test_content, mode='tfidf')

# In[08]: Answer No. 7

d_train_inputs = d_train_inputs/np.amax(np.absolute(d_train_inputs))
d_test_inputs = d_test_inputs/np.amax(np.absolute(d_test_inputs))

# In[09]: Answer No. 8