Example #1
class OneHot(object):
    def __init__(self, vocab, char_level=False, max_len=100):
        self.char_level = char_level
        self.max_len = max_len
        self.vocab = vocab

        self.tokenizer = Tokenizer(num_words=self.max_len,
                                   char_level=self.char_level)
        self.tokenizer.fit_on_texts(self.vocab)

    def encode(self, s):
        s_int = self.tokenizer.texts_to_sequences(s)
        s_int = [[x[0] - 1] for x in s_int]
        s_oh = to_categorical(s_int)
        return s_oh

    def decode(self, arr):
        s_int = np.argmax(arr, axis=1)
        s_int = [[x + 1] for x in s_int]
        s_list = self.tokenizer.sequences_to_texts(s_int)
        if self.char_level:
            sep = ''
        else:
            sep = ' '
        return sep.join(s_list)
Example #2
def embed_flair(texts, max_length=100, max_words=1000):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    texts = tokenizer.sequences_to_texts(sequences)

    sentence_embeddings = []
    padding = np.zeros(embedding_features)
    count = 0
    step = 3
    total = len(texts)
    for text in texts:
        sentence_embedding = []
        paddings = []
        sentence = Sentence(text)
        embeddings_flair.embed(sentence)
        for token in sentence:
            sentence_embedding.append(token.embedding.cpu().numpy())
        for i in range(max_length - len(sentence_embedding)):
            paddings.append(padding)
        if len(paddings) > 0:
            sentence_embedding = np.concatenate([paddings, sentence_embedding],
                                                axis=0)
        else:
            sentence_embedding = np.array(sentence_embedding[:max_length])
        count += 1
        if 100 * count / total > step:
            print(str(step) + '%')
            step += 3
        sentence_embeddings.append(sentence_embedding)

    return np.array(sentence_embeddings)
Example #3
 def _train_model(self, model: Word2Vec, texts):
     tokenizer = Tokenizer()
     tokenizer.fit_on_texts(texts)
     texts_seq = tokenizer.sequences_to_texts(
         tokenizer.texts_to_sequences(texts))
     texts_seq = [f.split(" ") for f in texts_seq]
     print("Adding to word2vec vocabulary...")
     model.min_count = 2
     model.build_vocab(texts_seq, update=True)
     print("Training word2vec ...")
     model.train(texts_seq,
                 total_examples=len(texts_seq),
                 epochs=model.epochs)
Example #4
    def prepareData(self, List):

        allText = ""
        for episode in List:
            allText = allText + " " + self.clean(episode)

        allTextTokenized = word_tokenize(allText)
        tokenizer = Tokenizer(filters="")

        tokenizer.fit_on_texts(allTextTokenized)

        # turn words into sequences
        s = tokenizer.texts_to_sequences(allTextTokenized)
        self.word_idx = tokenizer.word_index
        self.idx_word = tokenizer.index_word
        self.num_words = len(self.word_idx) + 1
        t = tokenizer.sequences_to_texts(s)

        w = [''.join(i) for i in t]
        wseq = []
        for a in w:
            if a != '':
                wseq.append(self.word_idx[a])

        trainingdata = []
        a = 0
        while (a + 60) < len(wseq):
            x = []
            for i in range(a, a + 60):
                x.append(wseq[i])
            y = wseq[a + 60]
            temp = [x, y]
            trainingdata.append(temp)
            a = a + 1

        print("number of training pairs", len(trainingdata))
        print('number of words :', self.num_words)
        self.dataX = []
        self.dataY = []
        for e in trainingdata:
            self.dataX.append(e[0])
            self.dataY.append(e[1])

        # reshape X to be [samples, time steps, features]
        self.X = np.reshape(self.dataX, (len(trainingdata), 60, 1))
        # normalize
        self.X = self.X / float(self.num_words)
        # one hot encode the output variable
        self.y = np_utils.to_categorical(self.dataY)
Example #5
class OneHot:
    def __init__(self, docs_tokens, max_doc_len):
        self.tokenizer = Tokenizer(split=None,
                                   lower=False,
                                   filters=None,
                                   oov_token=True)
        self.tokenizer.fit_on_texts(docs_tokens)
        unknown_index = self.tokenizer.word_index.get('<<UNKNOWN>>', None)
        if unknown_index is not None:
            self.tokenizer.word_index.pop('<<UNKNOWN>>', None)
        self.tokenizer.word_index.pop(True, None)
        self.tokenizer.word_index.update({'<<UNKNOWN>>': 1})
        if unknown_index is not None:
            for word, id in self.tokenizer.word_index.items():
                if id > unknown_index:
                    self.tokenizer.word_index[word] = id - 1
        self.tokenizer.oov_token = '<<UNKNOWN>>'
        self.max_doc_len = max_doc_len
        self.max_word_num = self.tokenizer.num_words

    def get_sequence(self, tokens):
        return pad_sequences(self.tokenizer.texts_to_sequences([tokens]),
                             maxlen=self.max_doc_len)[0]

    def get_docs_sequences(self, docs_tokens):
        return pad_sequences(self.tokenizer.texts_to_sequences(docs_tokens),
                             maxlen=self.max_doc_len)

    def get_tokens(self, sequence):
        return self.tokenizer.sequences_to_texts([sequence])[0]

    def get_docs_tokens(self, docs_sequences):
        return self.tokenizer.sequences_to_texts(docs_sequences)

    def get_word_indexes(self):
        return self.tokenizer.word_index
Example #6
class DataHelper(object):
    def __init__(self, max_len=100, max_num_words=100000):
        self.max_len = max_len
        self.max_num_words = max_num_words
        self.tokenizer = Tokenizer(num_words=max_num_words)
        self.label_columns = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        self.text_column = 'comment_text'

    def _get_examples(self, data_path, fit_tokenizer):
        data = pd.read_csv(data_path, encoding="utf-8")
        labels = data[self.label_columns].values
        comments = data[self.text_column].values

        if fit_tokenizer:
            self.tokenizer.fit_on_texts(comments)

        comments = self.tokenizer.texts_to_sequences(comments)
        comments = pad_sequences(comments, self.max_len)

        return comments, labels

    def sequences_to_texts(self, sequences):
        return self.tokenizer.sequences_to_texts(sequences)

    def get_data_loader(self,
                        data_path,
                        batch_size,
                        shuffle=True,
                        fit_tokenizer=True,
                        device='cuda:0'):
        comments, labels = self._get_examples(data_path, fit_tokenizer)
        comments, labels = torch.as_tensor(comments).long(), torch.as_tensor(
            labels).float()

        comments = comments.to(device)
        labels = labels.to(device)

        dataset = TensorDataset(comments, labels)
        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 shuffle=shuffle)
        return data_loader
Example #7
class Model:
    def __init__(self, questions: List[str]):
        self.sliding_token_size = 15
        self.path = None
        self.model = None
        self.tokens = re.sub(r'[^a-zA-Z\s]', r' \g<0> ',
                             ' newline '.join(questions)).split()
        self.tokenizer = Tokenizer(filters='')
        self.tokenizer.fit_on_texts(self.tokens)
        self.x = numpy.zeros((len(self.tokens) - self.sliding_token_size,
                              self.sliding_token_size))
        self.y = numpy.zeros((len(self.tokens) - self.sliding_token_size, 1))
        sequences = self.tokenizer.texts_to_sequences(self.tokens)
        for token_n in range(len(self.tokens) - self.sliding_token_size):
            for sliding_token_n in range(self.sliding_token_size):
                self.x[token_n][sliding_token_n] = sequences[
                    token_n + sliding_token_n][0]
            self.y[token_n] = sequences[token_n + self.sliding_token_size][0]

    def save(self):
        pickle.dump(self.model, open(self.path + 'model.bin', 'wb'))
        pickle.dump(self.tokens, open(self.path + 'tokens.bin', 'wb'))
        pickle.dump(self.tokenizer, open(self.path + 'tokenizer.bin', 'wb'))

    def load(self):
        self.model = pickle.load(open(self.path + 'model.bin', 'rb'))
        self.tokens = pickle.load(open(self.path + 'tokens.bin', 'rb'))
        self.tokenizer = pickle.load(open(self.path + 'tokenizer.bin', 'rb'))

    def generate(self, n):
        questions = random.choice(self.tokens)
        while questions.count(' newline ') <= n:
            sequences = pad_sequences(
                [self.tokenizer.texts_to_sequences([questions])[0]],
                self.sliding_token_size)
            questions += ' ' + self.tokenizer.sequences_to_texts(
                [self.model.predict_classes(sequences)])[0]
        return re.sub(r'(?<=[\d\`\'])\s(?=[\d\`\'])', '',
                      questions.replace(' newline ',
                                        '\n').replace(' ?',
                                                      '?')).split('\n')[1:-1]
Example #8
# tokenize the sentences
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(train_text_vec)
train_text_vec = tokenizer.texts_to_sequences(train_text_vec)
test_text_vec = tokenizer.texts_to_sequences(test_text_vec)

# pad the sequences
train_text_vec = pad_sequences(train_text_vec, maxlen=MAX_SEQ_LEN)
test_text_vec = pad_sequences(test_text_vec, maxlen=MAX_SEQ_LEN)

print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", train_text_vec.max(), "\n")

print('Sample Tweet Before Processing:', train["text"].values[0])
print('Sample Tweet After Processing:',
      tokenizer.sequences_to_texts([train_text_vec[0]]), '\n')

print('What the model will interpret:', train_text_vec[0].tolist())

# One Hot Encode Y values:
encoder = LabelEncoder()

y_train = encoder.fit_transform(train['sentiment'].values)
y_train = to_categorical(y_train)

y_test = encoder.transform(test['sentiment'].values)  # reuse the label mapping fitted on train
y_test = to_categorical(y_test)

# get an idea of the distribution of the text values
from collections import Counter
ctr = Counter(train['sentiment'].values)
Example #9
# y_train.shape, y_test.shape:  (8982,) (2246,)
print("First training news article: \n", x_train[0])
# prints only the index numbers as a list
print("Label of the first training news article: \n", y_train[0])
# prints only the label index

# check which words the numbers in x_train stand for
word_index = reuters.get_word_index()
print("word_index of the x data: \n", word_index)
# prints the index of each word as a dictionary

# convert the indices back into words
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(reuters.get_word_index())
word = token.sequences_to_texts(x_train[0:1])
print("First words of x_train: \n", word)

# want to check the shape of x_train?
# it is a plain Python list, so it has no shape attribute
print(len(x_train[0]))  # 87

# print the number of categories in y
category = np.max(y_train) + 1
print("Number of labels in the y data: ", category)  # 46

# print the unique values of y
y_bunpo = np.unique(y_train)
print("Distribution of the y data: \n", y_bunpo)  # 0~45
# distribution of the y data:
#  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
Example #10
def model_word2vec(suffix=""):
    for round in range(0, 2):
        rtest = xlrd.open_workbook(filename="切割" + suffix + "test.xls")
        rtrain = xlrd.open_workbook(filename="切割" + suffix + "train.xls")
        r_vocall1 = xlrd.open_workbook(filename="pre处理" + suffix + "test.xls")
        r_vocall2 = xlrd.open_workbook(filename="pre处理" + suffix + "train.xls")
        sheet_test = rtest.sheet_by_index(0)
        sheet_train = rtrain.sheet_by_index(0)
        sheet1_vocall = r_vocall1.sheet_by_index(0)
        sheet2_vocall = r_vocall2.sheet_by_index(0)
        invocal1 = sheet1_vocall.col_values(4)

        invocal2 = sheet2_vocall.col_values(4)
        invocall = invocal1
        for i in range(0, len(invocal1)):
            if len(invocal1[i]) == 0:
                invocall = invocal1[:i]
                print("1")
                break

        for i in range(0, len(invocal2)):
            if len(invocal2[i]) == 0:
                print("1")
                invocal2 = invocal2[:i]
                break
        for i in invocal2:
            if i not in invocall:
                invocall.append(i)
        print(len(invocall))
        vocall_size = len(invocall)
        if round == 1:
            ex_tag = sheet_test.col_values(6)
        xtrain = sheet_train.col_values(2 + round * 3)
        ztrain = sheet_train.col_values(0 + round * 3)
        ytrain = sheet_train.col_values(1 + round * 3)
        xtest = sheet_test.col_values(2 + round * 3)
        ztest = sheet_test.col_values(0 + round * 3)
        ytest = sheet_test.col_values(1 + round * 3)

        for i in range(0, len(xtrain)):
            if len(xtrain[i]) == 0:
                xtrain = xtrain[:i]
                ztrain = ztrain[:i]
                ytrain = ytrain[:i]
                break
        for i in range(0, len(xtest)):
            if len(xtest[i]) == 0:
                xtest = xtest[:i]
                ytest = ytest[:i]
                ztest = ztest[:i]
                break
        print(round * 3)
        print(len(xtrain), "xtrain")
        print(len(ytrain), "ytrain")
        print(len(xtest), "xtest")
        print(len(ytest), "ytest")
        if round == 1:
            other = sheet_train.cell(0, 13).value
            other = int(other)
            print(other)
            if other == 1:
                xtrain = xtrain + sheet_train.col_values(9)
                ytrain = ytrain + sheet_train.col_values(8)
                ztrain = ztrain + sheet_train.col_values(7)

        tokenizer = Tokenizer(num_words=vocall_size)
        tokenizer.fit_on_texts(invocall)
        xtrain = tokenizer.texts_to_sequences(xtrain)
        xtest = tokenizer.texts_to_sequences(xtest)
        maxlen = 0
        for i in xtrain:
            if len(i) > maxlen:
                maxlen = len(i)
        for i in xtest:
            if len(i) > maxlen:
                maxlen = len(i)
        print(maxlen, "maxlen")
        xtrain = pad_sequences(xtrain, padding='post', maxlen=maxlen)
        xtest = pad_sequences(xtest, padding='post', maxlen=maxlen)
        print(len(ytrain), len(xtrain))
        print(len(ytest), len(xtest))
        for i in range(0, len(ytrain)):
            ytrain[i] = int(ytrain[i])
        for i in range(0, len(ytest)):
            ytest[i] = int(ytest[i])
        modelw2v = gensim.models.Word2Vec.load("word2vec_150_lstm.model")
        embedding_matrix = np.zeros(shape=(vocall_size + 1, 150))
        # map tokenizer indices to word2vec vectors (indices beyond num_words are skipped)
        for word, i in tokenizer.word_index.items():
            if i > vocall_size:
                continue
            if word in modelw2v.wv:
                embedding_matrix[i] = modelw2v.wv[word]
        embedding_size = 150
        hidden_layer_size = 64
        batch_size = 128
        num_epochs = 3
        model = Sequential()
        model.add(
            Embedding(vocall_size + 1,
                      embedding_size,
                      trainable=False,
                      weights=[embedding_matrix],
                      input_length=maxlen))
        model.add(SpatialDropout1D(0.2))
        model.add(Attention())
        model.add(LSTM(hidden_layer_size, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(1))
        model.add(Activation("sigmoid"))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.summary()

        history = model.fit(xtrain,
                            ytrain,
                            epochs=7 + round * 5,
                            batch_size=64)
        loss, accuracy = model.evaluate(xtest, ytest)
        print(loss, accuracy)
        """
        plt.subplot(211)
        plt.title("Accuracy"+suffix)
        plt.plot(history.history['acc'],color="g",label="Train")
      
        plt.legend(loc="best")

        plt.subplot(212)
        plt.title("Loss")
        plt.plot(history.history['loss'],color="g",label="Train")
       
        plt.legend(loc="best")

        plt.tight_layout()
        plt.show()
        """

        w = xlwt.Workbook()
        sheet2 = w.add_sheet("准备文件", cell_overwrite_ok=True)
        sheet2.write(0, 0, "predict")
        sheet2.write(0, 1, "ytest")
        sheet2.write(0, 2, "xtest")
        sheet2.write(0, 3, "ex_tag")
        sheet2.write(0, 4, "loss")
        sheet2.write(1, 4, loss)
        sheet2.write(0, 5, "acc")
        sheet2.write(1, 5, accuracy)
        ypred = model.predict_classes(xtest, 1)
        xtest = tokenizer.sequences_to_texts(xtest)
        for index in range(0, len(ypred)):
            sheet2.write(1 + index, 0, int(ypred[index][0]))
            sheet2.write(1 + index, 1, ytest[index])
            sheet2.write(1 + index, 2, xtest[index])
            if round == 1:
                sheet2.write(1 + index, 3, ex_tag[index])
        if round == 1:
            ac_sum = []
            for i in range(0, len(ypred)):
                ac = 0
                count = 1
                ac = ac + int(ypred[i][0])
                for i2 in range(i, len(ypred)):
                    if i != len(ypred) - 1 and ex_tag[i] == ex_tag[i + 1]:
                        i = i + 1
                        ac = ac + int(ypred[i][0])
                        count = count + 1
                    else:
                        acca = ac / count
                        if acca >= 0.5:
                            ac_sum.append("1")
                        else:
                            ac_sum.append("0")
                        break
            right = 0
            refer_tag = sheet_test.col_values(1)
            for i in range(0, len(ac_sum)):
                if refer_tag[i] == ac_sum[i]:
                    right += 1
            acr = right / len(ac_sum)
            sheet2.write(0, 7, "acc")
            sheet2.write(1, 7, acr)
        if round == 0:
            w.save("result切割" + suffix + "w2v.xls")
        else:
            w.save("result扩充" + suffix + "w2v.xls")
Example #11
model.fit(train_x,
          to_categorical(train_y),
          epochs=1,
          verbose=1,
          shuffle=True,
          validation_split=0.2)

# evaluate the model on a separate validation set
loss, accuracy = model.evaluate(valid_x, to_categorical(valid_y), verbose=1)
print('Accuracy: %f' % (accuracy * 100), loss)
############################################################
#print the precision and recall numbers
predicted = model.predict(valid_x)
predicted = np.argmax(predicted, axis=1)
#get the actual labels
predicted_cats = cats.inverse_transform(predicted)
valid_y_cats = cats.inverse_transform(valid_y)
print(predicted_cats)
print(classification_report(valid_y_cats, predicted_cats))
pred_df = pd.DataFrame()
pred_df['valid_x__post_cleaned'] = t.sequences_to_texts(valid_x)
pred_df['actual_label'] = valid_y_cats
pred_df['predictions'] = predicted_cats
pred_df.to_csv('predictions.csv')
#print(pred_df)
# if you want to see the classes w.r.t. the cleaned doc text
input()
Example #12
def get_sequence_tokens(text):
    tokenizer.fit_on_texts(text)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in text:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words


text_sequences, total_words = get_sequence_tokens(clean_lines[:1000])
print(text_sequences[:5])
print(tokenizer.sequences_to_texts(text_sequences[:5]))
print(f"Length of sequences array {len(text_sequences)}")
"""As you can see from the example above, the sentence is converted into N number of words.

Now while feeding data, we require the number of inputs to be constant.

However every sentence will not have the same number of words in it.

Hence, inorder to adjust for this varying length we pad the sentences (pre or post).

The maximum sequence size is defined by the maximum length in the input_sequences.
"""


def generate_padded_sequences(input_sequences):
    max_sequence_length = max([len(x) for x in input_sequences])
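# The function above is cut off here. A minimal sketch of how the padding step
# described in the note above could be completed; the helper name and the
# predictors/labels split are illustrative, not taken from the original snippet.
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np


def generate_padded_sequences_sketch(input_sequences, total_words):
    # pre-pad every n-gram sequence to the length of the longest one
    max_sequence_length = max(len(x) for x in input_sequences)
    padded = np.array(pad_sequences(input_sequences,
                                    maxlen=max_sequence_length,
                                    padding='pre'))
    # the last token of each row is the next-word label; earlier tokens are the input
    predictors, labels = padded[:, :-1], padded[:, -1]
    labels = to_categorical(labels, num_classes=total_words)
    return predictors, labels, max_sequence_length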
Example #13
    #callbacks=[tensorboard, model_checkpoint],
    verbose=1)

# get the loss and metrics
result = model.evaluate(X_test, y_test)
# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

# Check for prediction result
predVal = model.predict(X_test)

# Convert question from number to text
X_test_N = tokenizer.sequences_to_texts(X_test)

#
X_test_N = pd.DataFrame(X_test_N)

Pred_One = pd.DataFrame(data=predVal, columns=["zero", "One"])

X_test_N["OneProb"] = Pred_One["One"]
X_test_N["ZeroProb"] = Pred_One["zero"]

#total spams predicted
X_test_N[X_test_N["OneProb"] >= 0.50]

X_test_N.shape

Example #14
import unittest

from keras.preprocessing.text import Tokenizer
import numpy as np

char_tokenizer = Tokenizer(char_level=True)
text = 'The quick brown fox jumped over the lazy dog'
char_tokenizer.fit_on_texts(text)
seq = char_tokenizer.texts_to_sequences(text)
print(seq)

char_tokenizer.sequences_to_texts(seq)
char_vectors = char_tokenizer.texts_to_matrix(text)
print(char_vectors)

print(char_vectors.shape)

print(char_vectors[0])

np.argmax(char_vectors[0])
print(char_tokenizer.index_word)

print(char_tokenizer.word_index)

print(char_tokenizer.index_word[np.argmax(char_vectors[0])])

# ### Adding this method below just for the test case; it is not present in the exercise


def get_one_hot_vector(text_, word_index):
    char_tokenizer.fit_on_texts(text_)
Example #15
class NERSlotFiller(object):
    """NER Slot Classifier."""
    def __init__(self, maxlen=50, vocab_size=10000):
        """Init."""
        self.ner = None
        self.maxlen = maxlen
        self.vocab_size = vocab_size

    def fit(self, sentence_result, slot_result):
        """Fit model."""
        self.tokenizer = Tokenizer(num_words=self.vocab_size,
                                   char_level=True,
                                   lower=False)
        self.tokenizer.fit_on_texts(sentence_result)
        seq = self.tokenizer.texts_to_sequences(sentence_result)
        seq_pad = pad_sequences(seq, maxlen=self.maxlen)

        self.tokenizer_y = Tokenizer(num_words=self.vocab_size,
                                     char_level=True,
                                     lower=False)
        self.tokenizer_y.fit_on_texts(slot_result)
        seq_y = self.tokenizer_y.texts_to_sequences(slot_result)
        seq_pad_y = pad_sequences(seq_y, maxlen=self.maxlen)

        self.ner = get_model(len(self.tokenizer_y.word_index) + 1)
        self.ner.model.fit(seq_pad, to_categorical(seq_pad_y), epochs=5)

    def predict_slot(self, nlu_obj):
        """Predict Slot."""
        tokens = nlu_obj['tokens']
        ret = self.predict([tokens])
        ner_ret = get_slots_detail(tokens, ret[0][-len(tokens):])
        nlu_obj['ner_slot_filler'] = {'slots': ner_ret}
        for slot in ner_ret:
            slot['from'] = 'ner_slot_filler'
        if len(nlu_obj['slots']) <= 0:
            nlu_obj['slots'] = ner_ret
        else:
            for slot in ner_ret:
                is_include = False
                for s in nlu_obj['slots']:
                    if slot['pos'][0] >= s['pos'][0] and slot['pos'][0] <= s[
                            'pos'][1]:
                        is_include = True
                        break
                    elif slot['pos'][1] >= s['pos'][0] and slot['pos'][1] <= s[
                            'pos'][1]:
                        is_include = True
                        break
                    elif s['pos'][0] >= slot['pos'][0] and s['pos'][0] <= slot[
                            'pos'][1]:
                        is_include = True
                        break
                    elif s['pos'][1] >= slot['pos'][0] and s['pos'][1] <= slot[
                            'pos'][1]:
                        is_include = True
                        break
                if not is_include:
                    nlu_obj['slots'].append(slot)
                    nlu_obj['slots'] = sorted(nlu_obj['slots'],
                                              key=lambda x: x['pos'][0])

        return nlu_obj

    def predict(self, sentence_result):
        """Predict sentence."""
        assert self.ner is not None, 'model not fitted'
        seq = self.tokenizer.texts_to_sequences(sentence_result)
        seq_pad = pad_sequences(seq, maxlen=self.maxlen)
        y_pred = self.ner.predict_proba(seq_pad).argmax(-1)
        y_pred = self.tokenizer_y.sequences_to_texts(y_pred)
        y_pred = tuple(
            [y.split(' ')[-len(s):] for s, y in zip(sentence_result, y_pred)])
        return y_pred

    def eval(self, sentence_result, slot_result):
        """Evaluate."""
        y_pred = self.predict(sentence_result)
        y_test = slot_result
        acc = 0
        bad = []
        for sent, real, pred in zip(sentence_result, y_test, y_pred):
            real_slot = get_slots(sent, real)
            pred_slot = get_slots(sent, pred)
            a = get_exact_right(real_slot, pred_slot)
            acc += a
            if not a:
                bad.append((sent, real, pred, real_slot, pred_slot))
        acc /= len(sentence_result)
        return acc, bad
Example #16
        model.load_weights(Model_File)
    except Exception:
        print("model error")
    model.fit(x_train, y_train, epochs=100, verbose=2)
    test = x_train[:1]

    result = []
    i = 0
    while True:
        char = model.predict(test)
        result.append([float(np.argmax(char))])
        test = [np.append(test[:, 1:], np.argmax(char))]
        test = np.array(test)
        i += 1
        if i > 50:
            break
    ttt = tokenizer.sequences_to_texts(result)
    print(ttt)

else:
    model.fit(x_train, y_train, epochs=100, verbose=2)
    model.save_weights(Model_File)
# for diversity in [1.0]:
#     generated = ''
#     sentence = ['문재인']
#     for i in range(500):
#         preds = model.predict(x, verbose=0)[0]
#         next_index = sample(preds, diversity)
#         next_char = indices_char[next_index]
#         generated += next_char
#         sentence = sentence[1:] + next_char
Example #17
class NeuralClassifier:
    """

    """
    def __init__(self):
        """Initializes a neural classifier's attributes

        """
        # a list of tuples of (type, data_clean, true_label)
        self.labelled_data = []
        self.labelled_validation_data = []
        self.model = None
        self.tokenizer = None
        self.labels = []
        self.label_encoder = None

    #force

    def pickle(self, fname, keep_data=False):
        """Pickles this classifier

        Parameters
        ----------
        fname : a file name
        keep_data : if test/validation data should be kept (will increase size of file)


        """
        with open(fname, 'wb') as f:
            if keep_data:
                pickle.dump(self, f)
            else:
                temp_l_data = self.labelled_data
                temp_v_data = self.labelled_validation_data
                self.labelled_data = []
                self.labelled_validation_data = []
                pickle.dump(self, f)
                self.labelled_data = temp_l_data
                self.labelled_validation_data = temp_v_data

    def to_pred(self, pred):
        """

        Parameters
        ----------
        pred : array_like
            A real vector st len(pred) == len(self.labels)

        Returns
        -------
        str
            The label string at the index of the first maximal value of pred

        """
        maxi = 0
        for i in range(1, len(pred)):
            if pred[i] > pred[maxi]:
                maxi = i
        return self.labels[maxi]

    def to_pred_comparison(self, pred):
        """

        Parameters
        ----------
        pred : array_like
            A real vector st len(pred) == len(self.labels)

        Returns
        -------
        array_like
            An array of tuples of (labels, prediction_prob) for each value in pred,
            in descending order by probability

        """
        probs = [(self.labels[i], pred[i]) for i in range(len(pred))]
        probs.sort(key=lambda x: x[1], reverse=True)
        return probs

    def add_data(self, file_id: str, data: str, true_label):
        """Adds the given data point to this model's data

        Parameters
        ----------
        file_id : str
            an id for the file this data point is drawn from
        data : str
        true_label
            The true label for this data point

        """

        # CURRENTLY NOT TAKING IN PRE-TOKENIZED FILE, DISCUSS WITH TEAM ABOUT ALTERING CLASSIFIER INTERFACE
        if true_label not in self.labels:
            self.labels.append(true_label)
        self.labelled_data.append((file_id, data, true_label))

    def add_validation_data(self, file_id: str, data: str, true_label: int):
        """Adds the given data point to this model's validation data

        Parameters
        ----------
        file_id : str
            an id for the file this data point is drawn from
        data : str
        true_label
            The true label for this data point
        """
        if true_label not in self.labels:
            self.labels.append(true_label)
        self.labelled_validation_data.append((file_id, data, true_label))

    def train(self,
              max_number_tokens=neural_constants.MAX_NUMBER_TOKENS,
              slice_length=neural_constants.SLICE_LENGTH,
              slice_overlap=neural_constants.SLICE_OVERLAP,
              glove_file=neural_constants.GLOVE_FILE,
              glove_dimensions=neural_constants.GLOVE_DIMENSIONS,
              diagnostic_printing=False,
              num_epochs=10,
              batch_size=5):
        """

        Parameters
        ----------
        max_number_tokens : int, optional
            The maximum number of distinct tokens allowed by the tokenizer.
            With more data, this value should increase
        slice_length : int, optional
            The length of the subslices that are sent through the model.
            With more data, this value should increase
            This value should probably not be greater than half the length of a typical document
        slice_overlap : float, optional
            The percent of each slice that is overlapped with its neighbors
            This value should be in the range [0,1), but probably not above .2
        glove_file : str, optional
            The .txt file containing the glove embeddings to use for this classifier
        glove_dimensions : int, optional
            The number of dimensions of the given glove_file
        diagnostic_printing : bool, optional
            True to output some statistics on all validation data
        num_epochs : int, optional
            The number of epochs to train the model for.
            Determined experimentally
        batch_size : int, optional
            The batch size to use when training the model
            Determined experimentally

        """

        has_validation = len(self.labelled_validation_data) > 0
        # create the tokenizer
        self.tokenizer = Tokenizer(num_words=max_number_tokens)
        training_data = [text for _, text, _ in self.labelled_data]
        self.tokenizer.fit_on_texts(training_data)
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.labels)

        # now build our training data_clean
        X_train = self.tokenizer.texts_to_sequences(training_data)

        if has_validation:
            X_validation = self.tokenizer.texts_to_sequences(
                [text for _, text, _ in self.labelled_validation_data])

        X_train, y_train_labels = data_slicer.slice_data(
            X_train, [y for _, _, y in self.labelled_data],
            slice_length=slice_length,
            overlap_percent=slice_overlap)
        if has_validation:
            X_validation, y_validation_labels = data_slicer.slice_data(
                X_validation, [y for _, _, y in self.labelled_validation_data],
                slice_length=slice_length,
                overlap_percent=slice_overlap)
        # convert labels to 1-hots

        y_train = np_utils.to_categorical(
            self.label_encoder.transform(y_train_labels))
        if has_validation:
            y_validation = np_utils.to_categorical(
                self.label_encoder.transform(y_validation_labels))

        # pad them as necessary
        if has_validation:
            X_validation = np.array(
                pad_sequences(X_validation,
                              padding="post",
                              maxlen=slice_length))
        X_train = pad_sequences(X_train, padding="post", maxlen=slice_length)

        # force change

        # get our glove embeddings
        glove = load_glove(glove_file, self.tokenizer.word_index,
                           glove_dimensions)

        # compute some neural_constants
        vocab_size = len(self.tokenizer.word_index) + 1

        # set model parameters
        self.model = Sequential()

        model_layers = [
            # must have these two layers first
            layers.Embedding(vocab_size,
                             glove_dimensions,
                             weights=[glove],
                             input_length=slice_length,
                             trainable=False),
            # now we have some options

            # as more data becomes available, a more optimal sequence of inner layers
            # may be discoverable
            layers.GlobalMaxPool1D(),
            layers.Dense(45, activation="relu"),
            layers.Dense(20, activation="sigmoid"),

            # final layer for the output probability distribution
            layers.Dense(len(self.labels), activation="softmax")
        ]
        # add them in
        for layer in model_layers:
            self.model.add(layer)
        self.model.compile(optimizer="adam",
                           loss="categorical_crossentropy",
                           metrics=["accuracy"])
        """
        print(np.shape(X_train))
        print(np.shape(y_train))
        print(np.shape(X_validation))
        print(np.shape(y_validation))
        """

        #X_train, y_train = shuffle_parallel_arrays(X_train, y_train)

        # now we fit (can take a while)
        if has_validation:
            self.model.fit(X_train,
                           y_train,
                           epochs=num_epochs,
                           verbose=False,
                           shuffle=True,
                           validation_data=(X_validation, y_validation),
                           batch_size=batch_size)
        else:
            self.model.fit(X_train,
                           y_train,
                           epochs=num_epochs,
                           verbose=False,
                           shuffle=True,
                           batch_size=batch_size)
        if diagnostic_printing and has_validation:

            def cm(true, pred):
                m = confusion_matrix(true, pred)
                print("Confusion matrix")
                print("   {0:3s} {1:3s}".format("P+", "P-"))
                print("T+ {0:<3d} {1:<3d}".format(m[1][1], m[0][1]))
                print("T- {0:<3d} {1:<3d}".format(m[1][0], m[0][0]))

            y_train_pred = [
                x for x in list(self.model.predict(X_train, verbose=False))
            ]
            y_validation_pred = [
                x
                for x in list(self.model.predict(X_validation, verbose=False))
            ]

            loss, acc = self.model.evaluate(X_train, y_train, verbose=False)
            print("Train L/A asd: {0:.4f} {1:.4f}".format(loss, acc))
            # cm(y_train, y_train_pred)
            loss, acc = self.model.evaluate(X_validation,
                                            y_validation,
                                            verbose=False)
            print("Validation L/A: {0:.4f} {1:.4f}".format(loss, acc))
            #cm(y_validation, y_validation_pred)

            nc = 0
            for i in range(len(X_validation)):
                print(y_validation_labels[i],
                      self.to_pred(y_validation_pred[i]), y_validation_pred[i])
                if y_validation_labels[i] == self.to_pred(
                        y_validation_pred[i]):
                    nc += 1
            print("acc:", nc / len(y_validation_labels))

    def predict(self,
                text,
                slice_length=neural_constants.SLICE_LENGTH,
                slice_overlap=neural_constants.SLICE_OVERLAP):
        """

        Parameters
        ----------
        text : str
            a string of text to predict
        slice_length : int, optional
            the slice length to use. Should match the model's slice length
        slice_overlap : float, optional
            The percent of each slice that is overlapped with its neighbors
            This value should be in the range [0,1), but probably not above .2

        Returns
        -------
        distribution: array_like
            The probability distribution s.t. distribution[i] == P(label of text == self.labels[i])
            Where len(distribution) == len(self.labels)
            And sum(distribution) == 1
            And for all i distribution[i] >= 0
        """
        tokenized = self.tokenizer.texts_to_sequences([text])
        slices, _ = data_slicer.slice_data(tokenized,
                                           None,
                                           slice_length=slice_length,
                                           overlap_percent=slice_overlap)
        #print(slices)
        X = np.array(pad_sequences(slices, padding="post",
                                   maxlen=slice_length))
        #print(X)
        predictions = [x for x in list(self.model.predict(X, verbose=False))]

        s = predictions[0]
        for p in predictions[1:]:
            for i in range(len(s)):
                s[i] += p[i]
        return self.to_pred_comparison([x / sum(s) for x in s])

    def slice_and_predict(self,
                          text,
                          slice_length=neural_constants.SLICE_LENGTH,
                          slice_overlap=neural_constants.SLICE_OVERLAP):
        """Slices and predicts the input string for each slice

        Parameters
        ----------
        text : str
            a string of text to predict
        slice_length : int, optional
            the slice length to use. Should match the model's slice length
        slice_overlap : float, optional
            The percent of each slice that is overlapped with its neighbors
            This value should be in the range [0,1), but probably not above .2

        Returns
        -------
        distribution: array_like
            The probability distribution s.t. distribution[i] == P(label of text == self.labels[i])
            Where len(distribution) == len(self.labels)
            And sum(distribution) == 1
            And for all i distribution[i] >= 0
        """
        tokenized = self.tokenizer.texts_to_sequences([text])
        slices, _ = data_slicer.slice_data(tokenized,
                                           None,
                                           slice_length=slice_length,
                                           overlap_percent=slice_overlap)
        restored = self.tokenizer.sequences_to_texts(slices)
        #print(slices)
        X = np.array(pad_sequences(slices, padding="post",
                                   maxlen=slice_length))
        #print(X)
        predictions = [x for x in list(self.model.predict(X, verbose=False))]
        return [(self.to_pred(predictions[i]), restored[i])
                for i in range(len(slices))]
Example #18
from keras.layers import Dense, Embedding, LSTM, Flatten, Conv1D, MaxPool1D, BatchNormalization, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=2000)
words = imdb.get_word_index()
# print(words)
token = Tokenizer()
token.fit_on_texts(words)
a = token.sequences_to_texts(x_train)
print(a)
# print(x_train.shape, x_test.shape)  # (8982,) (2246,)
# print(y_train.shape, y_test.shape)  # (8982,) (2246,)

# print(x_train[0]) #(25000,) (24996,)
# print(y_train[0]) #(25000,) (24996,)
#
# print(len(x_train[0])) #218
test = []
for x in x_train[0]:
    test.append(words)
print(test)

category = np.max(y_train) + 1
# print("카테고리 : ", category)  # 카테고리 :  2
Example #19
class ConceptTokenizer:
    unused_token = ['[UNUSED]']
    mask_token = ['[MASK]']

    def __init__(self,
                 special_tokens: Optional[Sequence[str]] = None,
                 oov_token='0'):
        self.special_tokens = special_tokens
        self.tokenizer = Tokenizer(oov_token=oov_token,
                                   filters='',
                                   lower=False)

    def fit_on_concept_sequences(self, concept_sequences):
        self.tokenizer.fit_on_texts(concept_sequences)
        self.tokenizer.fit_on_texts(self.mask_token)
        self.tokenizer.fit_on_texts(self.unused_token)
        if self.special_tokens is not None:
            self.tokenizer.fit_on_texts(self.special_tokens)

    def encode(self, concept_sequences):
        return self.tokenizer.texts_to_sequences(concept_sequences)

    def decode(self, concept_sequence_token_ids):
        return self.tokenizer.sequences_to_texts(concept_sequence_token_ids)

    def get_all_token_indexes(self):
        all_keys = set(self.tokenizer.index_word.keys())

        if self.tokenizer.oov_token is not None:
            all_keys.remove(
                self.tokenizer.word_index[self.tokenizer.oov_token])

        if self.special_tokens is not None:
            excluded = set([
                self.tokenizer.word_index[special_token]
                for special_token in self.special_tokens
            ])
            all_keys = all_keys - excluded
        return all_keys

    def get_first_token_index(self):
        return min(self.get_all_token_indexes())

    def get_last_token_index(self):
        return max(self.get_all_token_indexes())

    def get_vocab_size(self):
        # + 1 because index 0 is reserved for padding and never assigned to a token
        return len(self.tokenizer.index_word) + 1

    def get_unused_token_id(self):
        unused_token_id = self.encode(self.unused_token)
        while isinstance(unused_token_id, list):
            unused_token_id = unused_token_id[0]
        return unused_token_id

    def get_mask_token_id(self):
        mask_token_id = self.encode(self.mask_token)
        while isinstance(mask_token_id, list):
            mask_token_id = mask_token_id[0]
        return mask_token_id
Example #20
seed_text = ['reach highest level devastation']  # initial phrase
seed_tokens = tokenizer.texts_to_sequences(seed_text)[0]  # replace words with token ids

# pad the sequence with zeros so it has the length the network expects
tokens_x = pad_sequences([seed_tokens], maxlen=seq_len, )
tokens_x = to_categorical(tokens_x, num_classes=vocab_size)  # one hot
pred_y = model.predict(tokens_x)  # predicted probabilities for the next word

def sample_word(pred_y, temperature=1.0):
    # temperature sampling: convert the probabilities to logits and rescale them
    pred_token = tf.random.categorical(tf.math.log(pred_y) / temperature, 1).numpy()
    return pred_token  # sampled token id

next_token = sample_word(pred_y)
# convert the token id back into a word
next_word = tokenizer.sequences_to_texts(next_token)
print('Next token: ', next_token, '-->', next_word)

"""# **b)** *Após o treinamento, exiba pelo menos 5 exemplos de textos dados de entrada, e do texto gerado em seguida pela rede treinada. Para cada exemplo, gere pelo menos 10 palavras consecutivamente.*"""

def sample_word(pred_y, temperature=1.0):
    # temperature sampling: convert the probabilities to logits and rescale them
    pred_token = tf.random.categorical(tf.math.log(pred_y) / temperature, 1).numpy()
    return pred_token  # sampled token id
seed_text = ['true destroy unholy coalition',
             'there amazing wwii story', 
             'lightning hotter surface sun', 
             'mom setting bathroom curfew', 
             'retweet second receive goodnews']  # initial phrases
for rep in range(5):
  for rep2 in range(10):
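# The body of the nested loops above is not shown in this snippet. A minimal
# sketch of one way to generate 10 consecutive words for each seed phrase,
# reusing tokenizer, model, seq_len, vocab_size and sample_word from above
# (the variable names below are illustrative):
for seed in seed_text:
    generated = seed
    for _ in range(10):
        tokens = tokenizer.texts_to_sequences([generated])[0]
        tokens_x = pad_sequences([tokens[-seq_len:]], maxlen=seq_len)
        tokens_x = to_categorical(tokens_x, num_classes=vocab_size)
        pred_y = model.predict(tokens_x)
        next_token = sample_word(pred_y)
        generated += ' ' + tokenizer.sequences_to_texts(next_token)[0]
    print(seed, '-->', generated)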
Example #21
class SimulacrumGenerator:

    def __init__(self, max_words=1000, max_len=50, num_epochs=10, batch_size=128):
        self.simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_words = max_words
        self.max_len = max_len
        self.tok = Tokenizer(num_words=max_words)
        self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
        self.model = self.architecture()
        self.model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

    def architecture(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(self.max_words, self.max_len, input_length=self.max_len)(inputs)
        layer = LSTM(64)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model

    def architecture2(self):
        inputs = Input(name='inputs', batch_shape=(self.batch_size, self.max_len))
        layer = Embedding(self.max_words, self.max_len)(inputs)
        layer = GRU(1024, recurrent_initializer='glorot_uniform', stateful=True)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        # layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model


    def tokenize_sentences(self, sentences):
        sequences = self.tok.texts_to_sequences(sentences)
        # sequences = []
        # for vector in self.tok.texts_to_sequences(sentences):
        #     sequences.append(np.interp(vector, (0, self.max_words), (0, 1)))
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

    def detokenzie(self, vectors):
        return self.tok.sequences_to_texts((vectors*10000).astype("int"))
        # return self.tok.sequences_to_texts(np.interp(vectors, (0, 1), (0, self.max_words)).astype("int"))

    def create_inputs(self, sentences=None):
        if sentences is None:
            self.processor.extract()
            sentences = self.processor.received
        self.tok.fit_on_texts(sentences)
        # self.max_words = len(sentences)
        return self.tokenize_sentences(sentences)

    def generate(self, sentences=None):
        if sentences is None:
            inputs = self.create_inputs()
        else:
            inputs = self.create_inputs(sentences)
        return np.array(self.model.predict(inputs)), np.zeros(len(inputs))

    def train(self, callbacks=None):
        # cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)]
        cb=[]
        if callbacks is not None:
            cb.extend(callbacks)

        self.processor.extract()
        train_X = []
        train_y = []
        for pair in self.processor.pairs:
            train_X.append(self.processor.received[pair[1]])
            train_y.append(self.processor.sent[pair[0]])

        self.model.fit(self.create_inputs(train_X), self.create_inputs(train_y), epochs=self.num_epochs,
                       batch_size=self.batch_size, validation_split=0.2, callbacks=cb)


# generator = SimulacrumGenerator()
# outputs, y = generator.generate()
# print(outputs[0], generator.tokenize_sentences(generator.processor.received)[0])
# print(generator.detokenzie(outputs))
Example #22
class ComparativeNeuralClassifier(CW):
    def __init__(self):
        # a list of tuples of (type, data_clean, true_label)
        self.labelled_data = []
        self.labelled_validation_data = []
        self.labels = set()
        self.models = dict()
        self.tokenizer = None

    #force

    def add_data(self, file_id: str, tokenized_file: str, true_label: int):
        """

		:param file_id: a hashable ID for this particular file
		:param tokenized_file: a
		:param true_label:
		:return: None
		"""

        # CURRENTLY NOT TAKING IN PRE-TOKENIZED FILE, DISCUSS WITH TEAM ABOUT ALTERING CLASSIFIER INTERFACES
        self.labels.add(true_label)
        self.labelled_data.append((file_id, tokenized_file, true_label))

    def add_validation_data(self, file_id: str, data: str, true_label: int):
        """

		:param file_id:
		:param data:
		:param true_label:
		:return:
		"""

        self.labelled_validation_data.append((file_id, data, true_label))

    def get_data(self):
        """

		:return: A structure [(file_id, tokenized_file, true_label),...] for all data_clean added to this classifier with
		the add_data method
		"""
        raise NotImplementedError

    def train(self):
        """
		This classifier object will train on all the data_clean that has been added to it using the add_data method
		:return:
		"""

        # i want to use bagging

        # create the tokenizer
        self.tokenizer = Tokenizer(num_words=constants.MAX_NUMBER_TOKENS)
        training_data = [text for _, text, _ in self.labelled_data]
        self.tokenizer.fit_on_texts(training_data)

        # now build our training data_clean
        X_train = self.tokenizer.texts_to_sequences(training_data)
        X_validation = self.tokenizer.texts_to_sequences(
            [text for _, text, _ in self.labelled_validation_data])

        X_train, y_train = data_slicer.slice_data(
            X_train, [y for _, _, y in self.labelled_data],
            slice_length=constants.SLICE_LENGTH,
            overlap_percent=constants.SLICE_OVERLAP)

        X_validation, y_validation = data_slicer.slice_data(
            X_validation, [y for _, _, y in self.labelled_validation_data],
            slice_length=constants.SLICE_LENGTH,
            overlap_percent=constants.SLICE_OVERLAP)

        # pad them as necessary
        X_train = np.array([
            np.array(x) for x in pad_sequences(
                X_train, padding="post", maxlen=constants.SLICE_LENGTH)
        ])
        X_validation = np.array(
            pad_sequences(X_validation,
                          padding="post",
                          maxlen=constants.SLICE_LENGTH))

        # force change

        # get our glove embeddings
        glove = load_glove(constants.GLOVE_FILE, self.tokenizer.word_index)

        # compute some neural_constants
        vocab_size = len(self.tokenizer.word_index) + 1

        for label in self.labels:
            # set model parameters
            self.models[label] = Sequential()
            model_layers = [
                # must have these two layers first
                layers.Embedding(vocab_size,
                                 constants.GLOVE_DIMENSIONS,
                                 weights=[glove],
                                 input_length=constants.SLICE_LENGTH,
                                 trainable=False),
                layers.GlobalMaxPool1D(),
                # now we have some options
                layers.Dense(20, activation="relu"),
                layers.Dense(15, activation="sigmoid"),
                # layers.Dense(10, activation="sigmoid"),
                # probably want a final sigmoid layer to get smooth value in range (0, 1)
                layers.Dense(1, activation="sigmoid")
            ]
            # add them in
            for layer in model_layers:
                self.models[label].add(layer)
            self.models[label].compile(optimizer="adam",
                                       loss="binary_crossentropy",
                                       metrics=["accuracy"])

            y_train_binary = [1 if l == label else 0 for l in y_train]

            # now we fit (can take a while)
            self.models[label].fit(X_train,
                                   y_train_binary,
                                   epochs=25,
                                   verbose=False,
                                   shuffle=True,
                                   validation_data=(X_validation,
                                                    y_validation),
                                   batch_size=10)
        predictions = dict()
        stats = dict()
        for label in self.labels:
            predictions[label] = self.models[label].predict(X_validation,
                                                            verbose=False)
        for label in self.labels:
            stats[label] = {
                "mean": np.mean(predictions[label]),
                "std": np.std(predictions[label]),
                "max": np.max(predictions[label]),
                "min": np.min(predictions[label])
            }

        texts = self.tokenizer.sequences_to_texts(X_validation)

        sorted_labels = sorted(list(self.labels))

        ncorrect = [0] * 4
        n = 0
        with open('classifiers/neural/cnc.csv', 'w', newline='\n') as csvfile:
            csvw = csv.writer(csvfile)
            for i in range(len(y_validation)):

                outputs = [predictions[label][i][0] for label in sorted_labels]
                zscores = [(predictions[label][i][0] - stats[label]["mean"]) /
                           stats[label]["std"] for label in sorted_labels]
                normalized = [
                    (predictions[label][i][0] - stats[label]["min"]) /
                    stats[label]["max"] for label in sorted_labels
                ]
                pred = [
                    np.argmax([(outputs[j] + zscores[j])
                               for j in range(len(outputs))]),
                    np.argmax(outputs),
                    np.argmax(zscores),
                    np.argmax(normalized)
                ]

                n += 1
                for j in range(len(pred)):
                    if pred[j] == y_validation[i]:
                        ncorrect[j] += 1

                row = [y_validation[i]] \
                      + normalized \
                      + outputs  \
                      + zscores \
                      + pred \
                      + [texts[i]]
                csvw.writerow(row)

        print(ncorrect)
        print([x / n for x in ncorrect])
        print(n)

    def predict(self, tokenized_file: str, minimum_confidence=.8):
        """

		:param tokenized_file: the array containing the ordered, sanitized word tokens from a single file
		:param minimum_confidence: the minimum confidence level required to the classifier to label a data_clean point as
		any given class. Only used by applicable classifiers.
		:return: a list of tuples of [(class label, confidence)] for each class label where confidence >
		minimum_confidence. Confidence will be 1 for classifiers where confidence is not a normally used feature.
		"""

        raise NotImplementedError
Example #23
model.add(Embedding(vocab_size,embedding_size,input_length=maxlen))
model.add(LSTM(256))

model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(512,activation='relu'))

model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(3,activation = 'softmax'))

# review the model
model.compile(loss= 'categorical_crossentropy',optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

# begin training on dataset
batch_size = 128
num_epochs = 5
model.fit(x_train1,y_train1,validation_data= (x_val,y_val),batch_size = batch_size, epochs= num_epochs)

# check accuracy on test data
scores = model.evaluate(x_test,y_test,verbose=0)
print("accuracy:",str(scores[1]))

u=8
print(tk.sequences_to_texts(x_test[u:u+1]))
print(model.predict(x_test[u:u+1]))
Example #24
class FastTextModel(SupervisedBaseModel):
    def __init__(self, task):
        super(FastTextModel, self).__init__(task)
        self.args = task.args
        self.epochs = 15
        self.max_len = 50
        self.batch_size = 32
        self.max_features = 5000
        self.embeddings_dim = self.args.embeddings_size
        self.embeddings_matrix = None
        self.ngram_range = 1
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.model = None
        self.token_indice = None
        self.num_labels = len(self.args.labels)

    def build_model(self):
        print('Build model...')
        model = Sequential()
        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        weights = None if self.embeddings_matrix is None else [
            self.embeddings_matrix
        ]
        model.add(
            Embedding(
                self.max_features,
                self.embeddings_dim,
                input_length=self.max_len,
                #trainable=False,
                weights=weights,
                mask_zero=True), )
        # we add a GlobalAveragePooling1D, which will average the embeddings
        # of all words in the document
        model.add(GlobalAveragePooling1D())
        # We project onto a two-unit output layer and squash it with a sigmoid:
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    def show_text_info(self, X_text):
        print(len(X_text), 'text sequences')
        X_text_lens = list(map(len, X_text))
        print('Average sequence length: {}'.format(
            np.mean(X_text_lens, dtype=int)))
        print('Max sequence length: {}'.format(np.max(X_text_lens)))

    def add_ngrams(self, X_text):
        if self.ngram_range == 1:
            return X_text

        if self.token_indice is None:
            print('Adding {}-gram features'.format(self.ngram_range))
            # Create set of unique n-gram from the training set.
            ngram_set = set()
            for input_list in X_text:
                for i in range(2, self.ngram_range + 1):
                    set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = self.max_features + 1
            self.token_indice = {
                v: k + start_index
                for k, v in enumerate(ngram_set)
            }
            indice_token = {self.token_indice[k]: k for k in self.token_indice}

            # max_features is the highest integer that could be found in the dataset.
            self.max_features = np.max(list(indice_token.keys())) + 1

        # Augmenting x_train and x_test with n-grams features
        X_text = add_ngram(X_text, self.token_indice, self.ngram_range)
        self.show_text_info(X_text)
        return X_text

    def fit_text(self, X_text, y=None):

        X_unlabeled = self.dataset.X_train_unlabeled.values
        X_unlabeled_text = X_unlabeled[:, self.args.TEXT_COL]
        X = np.append(X_text, X_unlabeled_text, axis=0)

        #X = self.preprocess_text(X)
        self.tokenizer.fit_on_texts(X)
        X = self.tokenizer.texts_to_sequences(X)
        X = self.tokenizer.sequences_to_texts(X)
        self.text_rep_model = self.build_fit_w2v(X)

    def transform_text(self, X_text):
        X = self.tokenizer.texts_to_sequences(X_text)
        X = self.tokenizer.sequences_to_texts(X)
        X = self.transform_text_to_w2v(self.text_rep_model, X)
        return X

    def preprocess_text(self, X_text):
        self.tokenizer.fit_on_texts(X_text)

        num_words = len(self.tokenizer.word_index)
        #self.max_features = np.minimum(self.max_features, num_words) + 1 # add padding
        self.max_features = num_words + 1  # add 1 for the padding index (0)

        self.embeddings_matrix = get_embedding_vectors(
            self.args.embeddings_path, self.tokenizer.word_index,
            self.max_features, self.embeddings_dim)

        X_text = self.tokenizer.texts_to_sequences(X_text)
        self.show_text_info(X_text)
        X_text = self.add_ngrams(X_text)
        self.max_len = int(np.max(list(map(len, X_text))))
        X = sequence.pad_sequences(X_text, maxlen=self.max_len)
        return X

    def train(self, X, y):
        print('TRAINING')
        X, y = self.augment_instances(X, y)
        # convert to sequences
        X_text = X[:, self.args.TEXT_COL]

        X_text = self.preprocess_text(X_text)

        X = X_text  # todo: add other features

        self.model = self.build_model()

        self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            #validation_data=(x_test, y_test)
        )

    def predict(self, X):
        print('PREDICT')
        X_text = X[:, self.args.TEXT_COL]
        X_text = self.tokenizer.texts_to_sequences(X_text)
        self.show_text_info(X_text)

        X_text = self.add_ngrams(X_text)

        X = sequence.pad_sequences(X_text, maxlen=self.max_len)
        y = self.model.predict(X, verbose=1)
        y = (y > 0.5).astype(int)
        return y
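create_ngram_set, add_ngram and get_embedding_vectors are used above but not defined in this snippet. A sketch of the two n-gram helpers, consistent with how they are called here and with the standard Keras FastText recipe (an assumption, not the original project code):

def create_ngram_set(input_list, ngram_value=2):
    # All contiguous n-grams of the given size, as a set of tuples.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    # Append the integer id of every known n-gram to each sequence.
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences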
Beispiel #25
0
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

num_train = int(len(encoder_inputs) * 0.9)

train_generator = SequenceGenerator(encoder_inputs[0:num_train], decoder_inputs[0:num_train],
                                    decoder_targets[0:num_train], args.batch_size, max_len_target, args.hidden_dim_decoder, num_words_output)
validation_generator = SequenceGenerator(encoder_inputs[num_train:], decoder_inputs[num_train:],
                                         decoder_targets[num_train:], args.batch_size, max_len_target, args.hidden_dim_decoder, num_words_output)

print('Start training')

callbacks = [TensorBoard(os.path.join(args.logs_dir, 'attention-{0}'.format(datetime.now().isoformat().replace(':','-').split('.')[0]))),
             ModelCheckpoint(os.path.join(args.models_dir, 'weights.{epoch:02d}-{val_loss:.2f}.h5'), save_best_only=True)]

r = model.fit_generator(
    generator=train_generator,
    steps_per_epoch=len(train_generator),
    epochs=args.epochs,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    callbacks=callbacks,
    initial_epoch=args.initial_epoch
    )

for i in range(min(20, len(encoder_inputs))):
    output = inference_model.predict(np.array(encoder_inputs[i:i + 1]),
                                     tokenizer_outputs.word_index['<sos>'],
                                     tokenizer_outputs.word_index['<eos>'])
    output_sentences = tokenizer_outputs.sequences_to_texts([list(output)])
    print(input_texts[i], '<qa>', output_sentences[0])
Beispiel #26
0
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_train)
tokenizer.num_words = 2000  # cap the vocabulary used by texts_to_sequences
X_train = tokenizer.texts_to_sequences(review_train)
X_test = tokenizer.texts_to_sequences(review_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index


maxlen = 15

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

X_train = tokenizer.sequences_to_texts(X_train)
X_test = tokenizer.sequences_to_texts(X_test)

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X_train)
X_train = tfidf_vectorizer.transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

X_train = X_train.toarray()
X_test = X_test.toarray()

# reshape to (samples, features, 1), deriving the sizes from the arrays
# rather than hard-coding them
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
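The reshape to (samples, features, 1) suggests a 1-D convolutional or recurrent model downstream; a hypothetical sketch of such a model (layer sizes and the binary output are assumptions, not part of the original snippet):

from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(Conv1D(64, 5, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])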

Beispiel #27
0
# vectorizing text, turning each text into sequence of integers
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X)
# convert to sequence of integers
X = tokenizer.texts_to_sequences(X)
# convert to numpy arrays
X = np.array(X)
y = np.array(y)

# padding sequences at the beginning of each sequence with 0's to SEQUENCE_LENGTH
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

y = [label2int[label] for label in y]
y = np.asarray(y, dtype=np.float32)

XSpamText = tokenizer.sequences_to_texts(X[y == 1])
XHamText = tokenizer.sequences_to_texts(X[y == 0])

analysis = Analysis(XHamText, XSpamText)

# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)

split_frac = 0.5  # 50% validation, 50% test

split_id = int(split_frac * len(X_test))
X_val, X_test = X_test[:split_id], X_test[split_id:]
y_val, y_test = y_test[:split_id], y_test[split_id:]

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
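A natural next step, not shown in the original snippet, is to wrap these datasets in DataLoaders for batching; a sketch with an assumed batch size:

from torch.utils.data import DataLoader

batch_size = 64  # assumed value, not from the original snippet
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)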
print("[ INFO ][ Features loaded ][ Length: {} ]".format(len(features)))

lst = list()
for v in caps.values():
    lst.extend(v)

maxLen = max([len(words.split()) for words in lst])  # longest caption

tknzr = Tokenizer()
tknzr.fit_on_texts(lst)
vocab_size = len(tknzr.word_index) + 1

encoder_in, decoder_in, decoder_out = list(), list(), list()
for img, sents in caps.items():
    for cap in sents:
        seq = tknzr.texts_to_sequences([cap])[0]  # caption text -> integer sequence
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=maxLen)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            encoder_in.append(features[img])
            decoder_in.append(in_seq)
            decoder_out.append(out_seq)

inputs1 = Input(shape=(None, 25088))
en1 = Dropout(0.3)(inputs1)
en2 = Dense(latent_dims, activation='relu')(en1)
encoder_outputs, state_h, state_c = LSTM(latent_dims, return_state=True)(en2)
encoder_states = [state_h, state_c]
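The snippet stops at the encoder. A hypothetical continuation that wires a decoder over the padded caption inputs and the encoder states (layer choices and the extra imports are assumptions):

from keras.models import Model
from keras.layers import Embedding

inputs2 = Input(shape=(maxLen,))
de1 = Embedding(vocab_size, latent_dims, mask_zero=True)(inputs2)
de2 = LSTM(latent_dims)(de1, initial_state=encoder_states)
outputs = Dense(vocab_size, activation='softmax')(de2)
model = Model([inputs1, inputs2], outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')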
Beispiel #29
0
from keras.preprocessing.text import Tokenizer

text = "나는 맛있는 밥을 먹었다"

token = Tokenizer()
token.fit_on_texts([text])

print(token.word_index)

x = token.texts_to_sequences([text])
print(x)

print(token.sequences_to_texts(x))

from keras.utils import to_categorical

word_size = len(token.word_index) + 1
x = to_categorical(x, num_classes=word_size)
print(x)
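A hypothetical round trip from the one-hot rows back to text with the same tokenizer; argmax recovers the original indices because they start at 1:

import numpy as np

decoded = token.sequences_to_texts([[int(i) for i in np.argmax(x[0], axis=1)]])
print(decoded)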
Beispiel #30
0
def model(suffix="", suffix_fre=""):
    for round in range(0, 2):
        rtest = xlrd.open_workbook(filename="切割" + suffix + "test" +
                                   suffix_fre + ".xls")
        rtrain = xlrd.open_workbook(filename="切割" + suffix + "train" +
                                    suffix_fre + ".xls")
        r_vocall1 = xlrd.open_workbook(filename="pre处理" + suffix + "test" +
                                       suffix_fre + ".xls")
        r_vocall2 = xlrd.open_workbook(filename="pre处理" + suffix + "train" +
                                       suffix_fre + ".xls")
        sheet_test = rtest.sheet_by_index(0)
        sheet_train = rtrain.sheet_by_index(0)
        sheet1_vocall = r_vocall1.sheet_by_index(0)
        sheet2_vocall = r_vocall2.sheet_by_index(0)
        invocal1 = sheet1_vocall.col_values(4)

        invocal2 = sheet2_vocall.col_values(4)
        for i in range(0, len(invocal1)):
            if len(invocal1[i]) == 0:
                invocall = invocal1[:i]
                print("1")
                break
            if i == len(invocal1) - 1:
                invocall = invocal1

        for i in range(0, len(invocal2)):
            if len(invocal2[i]) == 0:
                print("1")
                invocal2 = invocal2[:i]
                break
        for i in invocal2:
            if i not in invocall:
                invocall.append(i)
        print(len(invocall))
        vocall_size = len(invocall)
        if round == 1:
            ex_tag = sheet_test.col_values(6)
        xtrain = sheet_train.col_values(2 + round * 3)
        ztrain = sheet_train.col_values(0 + round * 3)
        ytrain = sheet_train.col_values(1 + round * 3)
        xtest = sheet_test.col_values(2 + round * 3)
        ztest = sheet_test.col_values(0 + round * 3)
        ytest = sheet_test.col_values(1 + round * 3)

        for i in range(0, len(xtrain)):
            if len(xtrain[i]) == 0:
                xtrain = xtrain[:i]
                ztrain = ztrain[:i]
                ytrain = ytrain[:i]
                break
        for i in range(0, len(xtest)):
            if len(xtest[i]) == 0:
                xtest = xtest[:i]
                ytest = ytest[:i]
                ztest = ztest[:i]
                break
        print(round * 3)
        print(len(xtrain), "xtrain")
        print(len(ytrain), "ytrain")
        print(len(xtest), "xtest")
        print(len(ytest), "ytest")
        if round == 1:
            other = sheet_train.cell(0, 13).value
            other = int(other)
            print(other)
            if other == 1:
                xtrain = xtrain + sheet_train.col_values(9)
                ytrain = ytrain + sheet_train.col_values(8)
                ztrain = ztrain + sheet_train.col_values(7)
                for i in range(0, len(xtrain)):
                    if len(xtrain[i]) == 0:
                        xtrain = xtrain[:i]
                        ztrain = ztrain[:i]
                        ytrain = ytrain[:i]
                        break

        tokenizer = Tokenizer(num_words=vocall_size)
        tokenizer.fit_on_texts(invocall)
        xtrain = tokenizer.texts_to_sequences(xtrain)
        xtest = tokenizer.texts_to_sequences(xtest)
        maxlen = 0
        for i in xtrain:
            if len(i) > maxlen:
                maxlen = len(i)
        for i in xtest:
            if len(i) > maxlen:
                maxlen = len(i)
        print(maxlen, "maxlen")
        xtrain = pad_sequences(xtrain, padding='post', maxlen=maxlen)
        xtest = pad_sequences(xtest, padding='post', maxlen=maxlen)
        print(len(ytrain), len(xtrain))
        print(len(ytest), len(xtest))
        for i in range(0, len(ytrain)):
            ytrain[i] = int(ytrain[i])
        for i in range(0, len(ytest)):
            ytest[i] = int(ytest[i])
        embedding_size = 150
        hidden_layer_size = 64
        batch_size = 64
        num_epochs = 7
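        # Model sketch: embedding -> AT (appears to be a custom attention
        # layer defined elsewhere) -> spatial dropout -> LSTM -> dense layers
        # with a sigmoid output for binary classification.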
        model = Sequential()
        model.add(Embedding(vocall_size, embedding_size, input_length=maxlen))
        model.add(AT(25, 150))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(hidden_layer_size, dropout=0.2, recurrent_dropout=0.2))

        model.add(Dense(10))
        model.add(Dense(1))
        model.add(Activation("sigmoid"))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.summary()

        history = model.fit(xtrain, ytrain, epochs=num_epochs, batch_size=batch_size)
        loss, accuracy = model.evaluate(xtest, ytest)
        print(loss, accuracy)
        """
        plt.subplot(211)
        plt.title("Accuracy"+suffix)
        plt.plot(history.history['acc'],color="g",label="Train")
      
        plt.legend(loc="best")

        plt.subplot(212)
        plt.title("Loss")
        plt.plot(history.history['loss'],color="g",label="Train")
       
        plt.legend(loc="best")

        plt.tight_layout()
        plt.show()
        """

        w = xlwt.Workbook()
        sheet2 = w.add_sheet("准备文件", cell_overwrite_ok=True)
        sheet2.write(0, 8, "predict")
        sheet2.write(0, 9, "ytest")
        sheet2.write(0, 10, "xtest")
        sheet2.write(0, 11, "ex_tag")
        sheet2.write(0, 4, "loss")
        sheet2.write(1, 4, loss)
        sheet2.write(0, 5, "acc")
        sheet2.write(1, 5, accuracy)
        ypred = model.predict_classes(xtest, 1)
        xtest = tokenizer.sequences_to_texts(xtest)
        for index in range(0, len(ypred)):
            sheet2.write(index + 1, 0, int(ypred[index][0]))
            sheet2.write(index + 1, 1, ytest[index])
            sheet2.write(index + 1, 2, xtest[index])
            if round == 1:
                sheet2.write(index + 1, 3, ex_tag[index])

        if round == 0:
            w.save("result切割" + suffix + suffix_fre + "at.xls")
        else:
            w.save("result扩充" + suffix + suffix_fre + "at.xls")