# Plotting The Word Cloud For Text That Is Fake
#plt.figure(figsize = (20,20)) 
#wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 0].clean_joined))
#plt.imshow(wc, interpolation = 'bilinear')
#plt.show()


# Splitting Data Into Test And Train 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.clean_joined, df.isfake, test_size = 0.2)
from nltk import word_tokenize


# Creating A Tokenizer To Tokenize The Words And Create Sequences Of Tokenized Words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Adding Padding
padded_train = pad_sequences(train_sequences,maxlen = 40, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences,maxlen = 40, padding = 'post', truncating = 'post')  # pad the same way as the training sequences


# Sequential Model
model = Sequential()

# Embedding layer
model.add(Embedding(total_words, output_dim = 128))
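# A minimal sketch (assumed, not part of the original snippet) of how this Sequential
# model might be finished and trained on the padded sequences above; the layer sizes,
# batch size, and epoch count are placeholder choices.
from tensorflow.keras.layers import Bidirectional, LSTM, Dense

model.add(Bidirectional(LSTM(128)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_train, y_train, batch_size=64, validation_split=0.1, epochs=2)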
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

docs = [
    'so funny', 'very nice', 'well made movie', 'suggest this movie',
    'want to see again', 'dumdum',
    'boring movie', 'acting bad', 'not fun', 'boring', 'too boring',
    'very funny', 'he is handsome'
]
# Positive 1, Negative 0 (one label per document)
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1])

token = Tokenizer()
token.fit_on_texts(docs)

print(token.word_index)

x = token.texts_to_sequences(docs)
print(x)

from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_x = pad_sequences(x, padding='pre', maxlen=5)  # or padding='post' to pad at the end

print(pad_x)
print(np.unique(pad_x))
print(len(np.unique(pad_x)))

#2. Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, Conv1D
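# The original snippet stops at the imports above. A minimal sketch (assumed) of a model
# that fits the padded sequences: the vocabulary size comes from the tokenizer, the other
# sizes are placeholder choices.
vocab_size = len(token.word_index) + 1  # +1 because index 0 is reserved for padding

model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=5))  # 5 matches the maxlen used in pad_sequences
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(pad_x, labels, epochs=10)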
Example #3
num_words = 5000
# Maximum length of a news item
max_news_len = 50
# Number of news classes
nb_classes = 5

# Load the training data
train = pd.read_csv('data/train_en.csv', header=None, names=['text', 'star'])
reviews = train['text']
y_train = utils.to_categorical(train['star'] - 1, nb_classes)

for i in reviews:
    if type(i) is float:
        print(i)
# Create the tokenizer
tokenizer = Tokenizer(num_words=num_words)
# Fit the tokenizer
tokenizer.fit_on_texts(reviews)
# Apply it to our data
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_news_len)

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Build the neural network
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 100, input_length=max_news_len))
model_lstm.add(SpatialDropout1D(0.1))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.1))
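# A minimal sketch (assumed, not from the original) of how this classifier might be
# finished for the nb_classes news categories defined above; Dense is assumed to be
# imported alongside the other layers.
model_lstm.add(Dense(nb_classes, activation='softmax'))
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_lstm.fit(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1)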
    line = line.replace("(", "")
    line = line.replace(")", "")
    line = line.replace("/", "")
    line = line.replace("\\", "")
    line = line.replace("&", "")
    line = line.replace("#", "")
    line = re.sub(r'\d', '', line)
    line = line.split(' ')
    line = [w for w in line if not w in stop_words]
    line = " ".join(line)  # join the surviving tokens back into one cleaned string
    strings.append(line)

#encode text as numbers
tok_Len = 100000  # max number of words for tokenizer
tokenizer = Tokenizer(num_words=tok_Len)
tokenizer.fit_on_texts(strings)
sequences = tokenizer.texts_to_sequences(strings)
term_Index = tokenizer.word_index
print('Number of Terms:', len(term_Index))

sen_Len = 98  # max length of each sentences, including padding
tok_Features = pad_sequences(sequences, padding='post', maxlen=sen_Len)
print('Shape of tokenized features tensor:', tok_Features.shape)

indices = np.arange(tok_Features.shape[0])
np.random.shuffle(indices)
time_series = df['created_at_retweets']
time_series.reset_index(drop=True, inplace=True)

time_series = time_series[indices]
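# Presumably the same shuffled indices are meant to be applied to the tokenized features
# as well; a small sketch (assumed, not in the original) of doing that and carving out a
# simple 80/20 train/validation split.
tok_Features = tok_Features[indices]

split_at = int(0.8 * tok_Features.shape[0])
x_train, x_val = tok_Features[:split_at], tok_Features[split_at:]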
Example #5
print('\nDoing label encoding')
# Here the string labels are encoded as integers
label_encoder = LabelEncoder()
label_encoder.fit(training_labels)
training_labels = label_encoder.transform(training_labels)
print('\nDone label encoding')

vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

print('\nDoing tokenization')
# Here we tokenize the sentences taken from users
# and turn them into padded integer sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)
print('\nDone with tokenization')

# Creating the neural network model
print("\nBuilding the network")
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu', name='Hidden_Layer_1'))
model.add(Dense(16, activation='relu', name='Hidden_Layer_2'))
model.add(Dense(num_classes, activation='softmax', name='Output_Layer'))
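# A short sketch (assumed) of compiling and training the model defined above;
# sparse_categorical_crossentropy matches the integer labels produced by the LabelEncoder,
# and the epoch count is a placeholder.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, training_labels, epochs=200)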
Example #6
    def __init__(self, num_words=None):
        self.num_words = num_words
        if num_words is not None:
            self.tokenizer = Tokenizer(num_words=self.num_words)
        else:
            self.tokenizer = Tokenizer()
Example #7
train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# In[9]:

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences,
                             padding=padding_type,
                             maxlen=max_length)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
Example #8
url = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json'
#path_to_json = tf.keras.utils.get_file('sarcasm.json', origin=url)
#PATH = os.path.join(os.path.dirname(path_to_json), 'sarcasm')
sarcasm=wget.download(url)

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)



sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])


tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(len(word_index))
print(word_index)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'Hey! how are you doing?', 'I am doing great! What about you',
    'Well! I had some stuff to do but now I am free, thanks for asking'
]
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

word = tokenizer.word_index

sentence_sequences = tokenizer.texts_to_sequences(sentences)

test_sequence = [
    'Hey, everyone how are you doing ?',
    'Everyone: We are good, lets just party',
]
test_data_sequences = tokenizer.texts_to_sequences(test_sequence)
pad_seq = pad_sequences(sentence_sequences)
pad_test_seq = pad_sequences(test_data_sequences)

print(word)
print(sentence_sequences)
print(f'Test sequences: \n {test_data_sequences}')
print(f'Padded sequences for sentences:\n {pad_seq}')
print(f'Padded sequences for test data:\n {pad_test_seq}')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'i love my dog', 'i love my cat', 'you  love dog!',
    'Do you  think my dog is amazing?'
]

tokenizer = Tokenizer(num_words=100, oov_token="<RAJ> ")

tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index

#create sequences
sequences = tokenizer.texts_to_sequences(sentences)

#print(word_index)
#print(sequences)

test_data = ['i really love my dog', 'my dog loves my manatee']

test_seq = tokenizer.texts_to_sequences(test_data)

#print(test_seq)

# padding: 'post' or 'pre'; truncating and maxlen options are also available
padded = pad_sequences(sequences, padding='post', maxlen=8)

print(word_index)
print(sentences)
Example #11
    tk = loaded_tokenizer.texts_to_sequences([t1])
    for i in range(len(tk)):
        for j in range(22 - len(tk[i])):
            tk[i].insert(0, 0)
    tk = np.array(tk)
    return tk
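# The loop above left-pads each tokenized sentence with zeros to a fixed length of 22.
# An equivalent, shorter sketch using pad_sequences (the function name below is just for
# illustration; 22 is the input length assumed above):
def tokenize_and_pad(loaded_tokenizer, t1):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    seq = loaded_tokenizer.texts_to_sequences([t1])
    return pad_sequences(seq, maxlen=22, padding='pre')  # 'pre' matches inserting zeros at the front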


def predict(loaded_model, tk, le_loaded):
    pred = loaded_model.predict(tk)
    return le_loaded.inverse_transform([np.argmax(pred[0])])


is_first_time = False
model = tf.keras.Model()
tokenizer = Tokenizer(num_words=5000, split=" ")
le = preprocessing.LabelEncoder()


@app.route("/api", methods=["POST", "GET"])
def login():
    global is_first_time
    global model
    global tokenizer
    global le
    tweet = request.get_json()["tweet"]
    if not is_first_time:
        is_first_time = True
        model = load_model()
        tokenizer = load_tokenizer()
        le = load_label_encoder()
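    # A sketch (assumed; the original handler is cut off here) of how the request might be
    # answered: encode the tweet the same way as the helper above, then reuse predict().
    # pad_sequences and flask.jsonify are assumed to be imported at the top of the app.
    seq = tokenizer.texts_to_sequences([tweet])
    seq = pad_sequences(seq, maxlen=22, padding='pre')  # 22 matches the manual padding above
    return jsonify({"label": str(predict(model, seq, le)[0])})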
Example #12
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sent = [
    'This is sentence one', 'This is sentence two', 'That is sentence! three',
    'Do you believe this sentence is amazing?'
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sent)
word_index = tokenizer.word_index
initial_text_to_sequence = tokenizer.texts_to_sequences(sent)

print(word_index)
print(initial_text_to_sequence)
Example #13
plt.figure(figsize=(15, 5))
plt.plot(dates, temp)
plt.title('Topic', fontsize=20)

# In[63]:

from sklearn.model_selection import train_test_split
dates_train, dates_test, label_train, label_test = train_test_split(
    dates, temp, test_size=0.2)

# In[64]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(dates_train)
tokenizer.fit_on_texts(dates_test)

sekuens_latih = tokenizer.texts_to_sequences(dates_train)
sekuens_test = tokenizer.texts_to_sequences(dates_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)

# In[65]:


def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
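    # A sketch (assumed) of the usual continuation of this helper: slide a window over the
    # series, split each window into inputs and the next value, then shuffle and batch.
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + 1))
    ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-1], w[-1:]))
    return ds.batch(batch_size).prefetch(1)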
''' Remove reviews that are null '''
train_data = train_data.dropna(how='any')

''' Remove everything except Korean characters and spaces '''
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎ ㅏ-ㅣ가-힣 ]", "")

''' Define the stopwords (the choice of stopwords is subjective) '''
stop_words = ['다', '의', '가', '이', '은', '들', '는', '.', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

''' Load the prepared dictionary data '''
with open('x_train.json', encoding="utf-8") as f:
    x_train = json.load(f)

''' Integer-encode the tokenized text '''
tokenizer = Tokenizer(num_words=35000)  # build a Tokenizer that keeps only the 35,000 most frequent words
tokenizer.fit_on_texts(x_train)  # build the word index
x_train = tokenizer.texts_to_sequences(x_train)  # convert the strings into lists of integer indices

''' Convert the lists into a 2D integer tensor of shape (len(x_train), maxlen) '''
x_train = pad_sequences(x_train, maxlen=30)

''' Store the sample labels in y_train '''
y_train = np.array(train_data['label'])

''' Hide TensorFlow info and warning messages '''
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

''' Load the model trained with the LSTM algorithm '''
model = models.load_model('model2.h5')
Example #15
okt = Okt()
for sentence in train_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)  # tokenize
    temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
    X_train.append(temp_X)

X_test = []
for sentence in test_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)  # tokenize
    temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
    X_test.append(temp_X)

## Integer encoding ##
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# print(tokenizer.word_index)

## Check the proportion of rare words in the data ##
threshold = 3
total_cnt = len(tokenizer.word_index)  # number of words
rare_cnt = 0  # count of words whose frequency is below the threshold
total_freq = 0  # total frequency of all words in the training data
rare_freq = 0  # total frequency of the words whose frequency is below the threshold

# Iterate over the (word, frequency) pairs as key and value.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # If the word's frequency is below the threshold
        word = values[0]
        vectors = np.asarray(values[1:])
        sysevr_emb_dict[word] = vectors
sysevr_embeddings.close()

print('asts embedding')
print(ast_emb_dict['34'])

print('cg embedding')
print(cg_emb_dict['VAR1'])

print('sysevr embedding')
print(sysevr_emb_dict['const'])

# Tokenize corpus
ast_tokenizer = Tokenizer()

cg_tokenizer = Tokenizer()
bcg_tokenizer = Tokenizer()
fcg_tokenizer = Tokenizer()

sysevr_tokenizer = Tokenizer()

print("tokenizing asts")
# Fit tokenizers
ast_tokenizer.fit_on_texts(ast_data)

print("tokenizing cgs")
bcg_tokenizer.fit_on_texts(back_slices_data)
fcg_tokenizer.fit_on_texts(forward_slices_data)
from tensorflow.keras.preprocessing.text import Tokenizer
# it allows to use sentences of different lengths and use padding or truncation to
# make all of the sentences the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

# Create a Tokenizer instance. 100 is quite generous here, since we only have a handful of unique words.
# num_words = 100 it will take the 100 most common words
# OOV = out of vocabulary
tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")

tokenizer.fit_on_texts(sentences)
# tokenizer provides a word index property which returns a dictionary containing k-v pairs
word_index = tokenizer.word_index

# turn sentences into a set of sequences for me
# the sentences above are converted into tokens
sequences = tokenizer.texts_to_sequences(sentences)

# padding
padded = pad_sequences(sequences, maxlen=5)
# padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)

# padding='post'    -> zeros are added at the end of each sequence instead of the front
# maxlen=5          -> every sequence is padded or truncated to length 5 (without maxlen, padding goes to the longest sentence)
Example #18
def solution_model():
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
    urllib.request.urlretrieve(url, 'sarcasm.json')

    # DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type='post'
    padding_type='post'
    oov_tok = "<OOV>"
    training_size = 20000

    sentences = []
    labels = []
    # YOUR CODE HERE
    with open('sarcasm.json', 'r') as f:
        dataset = json.load(f)

    for item in dataset:
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])

    token = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    token.fit_on_texts(sentences)
    sentences = token.texts_to_sequences(sentences)
    sentences = pad_sequences(sentences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    # print(token.word_index)
    x_train = np.array(sentences[0:training_size])
    x_test = np.array(sentences[training_size:])
    y_train = np.array(labels[0:training_size])
    y_test = np.array(labels[training_size:])

    from tensorflow.keras.layers import Conv1D, Flatten, Dense, BatchNormalization
    model = tf.keras.Sequential([
    # YOUR CODE HERE. KEEP THIS OUTPUT LAYER INTACT OR TESTS MAY FAIL
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Conv1D(128, 3),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Conv1D(64, 5),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        # tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.MaxPool1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.summary()
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    es = EarlyStopping(patience=8)
    lr = ReduceLROnPlateau(factor=0.25, patience=4, verbose=1)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(x_train, y_train, epochs=1000, validation_split=0.2, callbacks=[es, lr])
    print(model.evaluate(x_test, y_test))

    return model
Example #19
def extract_features(pos_tagged_sentences,
                     feature_detector=ner_features,
                     included_features=['rnn_proba', 'word', 'pos', 'cluster'],
                     included_words=[-2, -1, 0, 1, 2]):
    """
    Transform a list of tagged sentences into a scikit-learn compatible POS dataset
    :param parsed_sentences:
    :param feature_detector:
    :return:
    """
    tokenizer = utils.get_tokenizer()
    sentences = []
    for pos_tagged_sentence in pos_tagged_sentences:
        sentence, pos = zip(*pos_tagged_sentence)
        sentences.append(sentence)
    sentences = [" ".join(words) for words in sentences]

    # GET RNN PROBA
    X_rnn = tokenizer.texts_to_sequences(sentences)
    X_rnn = sequence.pad_sequences(X_rnn,
                                   maxlen=81,
                                   padding='post',
                                   value=Const.PADDING)

    tags = [
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
    ]
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.utils import to_categorical
    from polyglot.text import Text
    pos_tokenizer = Tokenizer()
    pos_tokenizer.fit_on_texts(tags)

    def read_pos_from_sentences(sentences):
        pos = []
        for sent in sentences:
            plg = Text(sent)
            plg.language = 'id'
            _, plg = zip(*plg.pos_tags)
            pos.append(" ".join(list(plg)))
        pos = pos_tokenizer.texts_to_sequences(pos)
        return pos

    pos_rnn = read_pos_from_sentences(sentences)
    pos_rnn = sequence.pad_sequences(pos_rnn,
                                     maxlen=81,
                                     padding='post',
                                     value=Const.PADDING)
    pos_rnn = to_categorical(pos_rnn)

    # GET CLUSTERS
    list_of_clusters = None
    with open(Const.CLUSTER_ROOT + 'cluster_list_1000.pkl', 'rb') as fi:
        list_of_clusters = dill.load(fi)

    from we.cluster.KMeans import transform
    clusters = transform(X_rnn, list_of_clusters)

    X = []
    K.clear_session()
    ote = RNNOpinionTargetExtractor()
    ote.load_best_model()
    proba = ote.predict([X_rnn, pos_rnn], batch_size=1)

    for i in range(len(pos_tagged_sentences)):
        X_sent = sent2features(pos_tagged_sentences[i],
                               proba[i],
                               clusters[i],
                               feature_detector,
                               included_features=included_features,
                               included_words=included_words)
        X.append(X_sent)

    return X
Example #20
class NeuralNet1:

    tokenizer = Tokenizer()

    def __init__(self, filename, fetch_data=False):

        self.model = self.init_neuralnet()

        if fetch_data:
            self.X_train, self.Y_train, self.X_test, self.Y_test = self.get_data(
            )

        if filename is not None:
            self.test_inputdata(filename)
        elif filename is None:
            with open(filepath + '/tokenizer.pickle', 'rb') as handle:
                self.tokenizer = pickle.load(handle)
            self.model.load_weights(filepath + '/best_model.h5')
            print("NeuralNet2 Test ready complete")

    def get_data(self):
        train_data = pd.read_table('ratings_train.txt')
        test_data = pd.read_table('ratings_test.txt')

        train_data = NeuralNet1.preprocessing(train_data)
        test_data = NeuralNet1.preprocessing(test_data)

        X_train = NeuralNet1.Token(train_data)
        X_test = NeuralNet1.Token(test_data)

        self.tokenizer.fit_on_texts(X_train)
        with open(filepath + '/tokenizer.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Create the label data
        Y_train = np.array(train_data['label'])
        Y_test = np.array(test_data['label'])

        X_train, Y_train = NeuralNet1.rmEmpty(X_train, Y_train)

        # integer-encode the tokens, then pad
        X_train = NeuralNet1.Tokenizing(X_train, self.tokenizer)
        X_test = NeuralNet1.Tokenizing(X_test, self.tokenizer)
        X_train = pad_sequences(X_train, maxlen=100, padding='post')
        X_test = pad_sequences(X_test, maxlen=100, padding='post')

        return X_train, Y_train, X_test, Y_test

    def test_inputdata(self, filename):
        # Assumes one sentence per line,
        # with the data in the following format
        """
        id      document         label
        19238   영화가 재밌네요    1
        1234    재미없어요        0...
        -> Note: when predicting, leave the label column blank.
        """
        test_data = pd.read_table(filename)
        with open(filepath + '/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.model.load_weights(filepath + '/best_model.h5')
        print("Load tokenizer and model complete, will predict data")

        # print(test_data)
        for i in range(len(test_data)):
            result = self.sentiment_predict(test_data['document'][i])
            test_data.loc[i, ['label']] = result

        test_data.to_csv("result.txt", sep='\t', float_format='%.0f')

    def init_neuralnet(self):
        # Tune the hyperparameters here
        embedding_dim = 256
        hidden_dim = 512
        dropout_rate = 0.6

        # Model
        input = Input(shape=(100, ))
        x = Embedding(2542, embedding_dim)(input)
        x = Dropout(dropout_rate)(x)
        x = Conv1D(hidden_dim, 5, padding="same")(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
        x_res = x
        x = Bidirectional(LSTM(int(hidden_dim / 2), return_sequences=True))(x)
        x = x + x_res
        x = LeakyReLU()(x)
        x = LSTM(hidden_dim, return_sequences=True)(x)
        x = GlobalMaxPool1D()(x)
        x = Dropout(dropout_rate)(x)
        output = Dense(1, activation='sigmoid')(x)

        model = Model(inputs=[input], outputs=output)

        optimizer = Adam(learning_rate=0.001)
        loss_function = BinaryCrossentropy()
        model.compile(optimizer=optimizer, loss=loss_function, metrics=['acc'])
        model.summary()
        return model

    @staticmethod
    def preprocessing(data):
        data = data.drop_duplicates(subset=['document'])
        data = data.dropna(how='any')
        data['document'] = data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
        data['document'] = data['document'].replace('', np.nan)
        data = data.dropna(how='any')
        return data

    @staticmethod
    def Token(data):
        now = 0
        res = list()

        print("start token")
        for sentence in data['document']:
            if now % 10000 == 0:
                print(f"token : {now}/{len(data)}")
            now = now + 1
            temp = list()
            for i in range(len(sentence)):
                temp.append(sentence[i])
            res.append(temp)
        print("end token")
        return res

    @staticmethod
    def Tokenizing(data, tokenizer):
        return tokenizer.texts_to_sequences(data)

    @staticmethod
    def rmEmpty(data, label):
        drop_data = [
            index for index, sentence in enumerate(data) if len(sentence) < 1
        ]
        data = np.delete(data, drop_data, axis=0)
        label = np.delete(label, drop_data, axis=0)
        return data, label

    def sentiment_predict(self, sentence, justone=False):
        word_list = []
        for letter in sentence:
            word_list.append(letter)
        word_list.insert(0, "<s>")
        word_list.insert(len(word_list), "<e>")
        encoded = self.tokenizer.texts_to_sequences([word_list])
        padding_sentence = pad_sequences(encoded, maxlen=100)
        score = float(self.model.predict(padding_sentence))

        if not justone:
            if score > 0.5:
                return 1
            else:
                return 0
        elif justone:
            return score
Example #21
print('Vocabulary size :', total_cnt)
print('Number of rare words appearing %s times or fewer: %s' % (threshold - 1, rare_cnt))
print("Proportion of rare words in the vocabulary:", (rare_cnt / total_cnt)*100)
print("Share of total word frequency taken by rare words:", (rare_freq / total_freq)*100)


vocab_size = total_cnt - rare_cnt + 1
print('Vocabulary size :', vocab_size)

y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])
'''

max_words = 35000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
print(X_train[:3])
print(X_test[:3])
print(train_data['label'][:40])
#################################################
#################################################
# train_use = pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
# train_use['x_train'] = X_train
# train_use['y_train'] = train_data['label']
#
# train_use.to_csv("./data/train_d.csv", mode='w', index = False, header = False)

# Next, one-hot encode the labels -1, 0, 1 that go in as y values so the model can use them
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a Keras Tokenizer
en_tok = Tokenizer()

# Fit the tokenizer on some text
en_tok.fit_on_texts(en_text)

for w in ["january", "apples", "summer"]:
    # Get the word ID of word w
    id = en_tok.word_index[w]
    # Print the word and the word ID
    print(w, " has id: ", id)
def read_data(train_file, VAC_DIR, MAX_SEQUENCE_LENGTH):

    #    MAX_SEQUENCE_LENGTH = 1200  # truncation length for each text or sentence; only this many words are kept
    MAX_NUM_WORDS = 20000  # vocabulary size used to build the word vectors
    EMBEDDING_DIM = 100  # word-vector dimension
    VALIDATION_SPLIT = 0.3

    # Build the word-vector index
    print("Indexing word vectors.")
    embeddings_index = {}
    with open(VAC_DIR, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # the word
            coefs = np.asarray(values[1:], dtype='float32')  # the vector for this word
            embeddings_index[word] = coefs  # word -> vector

    print('Found %s word vectors.' % len(embeddings_index))  # e.g. 400000 words and word vectors

    print('Preprocessing the text dataset')
    texts = []  # list of training text samples
    labels = []  # list of labels

    # Read the training data
    data = pd.read_csv(train_file)

    texts = data['Item'].tolist()
    labels = data['Tag'].replace('non-LN', 0).replace('LN', 1).tolist()

    print("Found %s texts %s label_id." %
          (len(texts), len(labels)))  # e.g. 19997 text files

    # Vectorize the text samples
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    # fit_on_texts(texts) builds the token dictionary from a list of documents; texts is a list where each element is one document, and the words are de-duplicated
    tokenizer.fit_on_texts(texts)
    # texts_to_sequences(texts) converts the documents into vectors of word indices, with shape [len(texts), len(text)] -- (number of documents, length of each document)
    sequences = tokenizer.texts_to_sequences(texts)
    print(sequences[0])
    print(len(sequences))  # 19997

    word_index = tokenizer.word_index  # word_index is a dict mapping every word to its id, starting from 1
    print("Found %s unique tokens." % len(word_index))  # e.g. 174074 words
    # ['the', 'to', 'of', 'a', 'and', 'in', 'i', 'is', 'that', "'ax"] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    print(list(word_index.keys())[0:10], list(word_index.values())[0:10])  #

    ###### Very handy function, use it as-is
    data = pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH)  # sequences longer than MAX_SEQUENCE_LENGTH are truncated, shorter ones are zero-padded

    labels = to_categorical(np.asarray(labels))
    print("Training data shape:", data.shape)  # (19997, 1000)
    print("Labels shape:", labels.shape)  # (19997, 20)

    # Split the data into training and validation sets
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)  # shuffle the data
    data = data[indices]
    labels = labels[indices]

    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    # Training data
    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]

    # Validation data
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    # Prepare the embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)  # vocabulary size
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # 20000*100

    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:  # skip words ranked beyond MAX_NUM_WORDS by frequency
            continue
        embedding_vector = embeddings_index.get(word)  # look up this word's vector in the embedding dict
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return x_train, y_train, x_val, y_val, num_words, embedding_matrix
Example #24
    def train(self, bucket_name, key):
        dataset = pd.read_csv(self.dataset_location)

        X = []
        sentences = list(dataset['text'])
        for sen in sentences:
            X.append(pre.Preprocess(sen).preprocess_text())

        y = dataset['label']
        encoder = LabelBinarizer()
        y = encoder.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.20,
                                                            random_state=42)

        tokenizer = Tokenizer(num_words=10000)
        tokenizer.fit_on_texts(X_train)

        X_train = tokenizer.texts_to_sequences(X_train)
        X_test = tokenizer.texts_to_sequences(X_test)

        maxlen = 100

        X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
        X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

        embedding_dict = dict()
        with open(self.embedding_location, encoding='UTF-8') as glove_file:
            for line in glove_file:
                records = line.split()
                word = records[0]
                vector_dimension = np.asarray(records[1:], dtype='float32')
                embedding_dict[word] = vector_dimension

        vocab_size = len(tokenizer.word_index) + 1
        embedding_matrix = np.zeros((vocab_size, 100))

        for word, index in tokenizer.word_index.items():
            embedding_vector = embedding_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

        model = Sequential([
            Embedding(vocab_size,
                      100,
                      weights=[embedding_matrix],
                      input_length=maxlen,
                      trainable=False),
            Bidirectional(
                LSTM(50,
                     dropout=0.2,
                     recurrent_dropout=0.2,
                     return_sequences=True)),
            Bidirectional(
                LSTM(54,
                     dropout=0.3,
                     recurrent_dropout=0.3,
                     return_sequences=True)),
            Bidirectional(LSTM(60, dropout=0.3, recurrent_dropout=0.3)),
            Dense(64, activation="relu"),
            Dense(7, activation="softmax")
        ])

        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy', Precision(),
                               Recall()])

        history = model.fit(X_train,
                            y_train,
                            batch_size=128,
                            epochs=100,
                            verbose=1,
                            validation_split=0.2)

        with tempfile.TemporaryFile() as fp:
            dump(model, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(
                Body=fp.read(),
                Bucket='team08-public',
                Key='model/model_final.model')
            fp.close()

        with tempfile.TemporaryFile() as fp:
            dump(encoder.classes_, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(
                Body=fp.read(),
                Bucket='team08-public',
                Key='model/class_names.npy')
            fp.close()

        with tempfile.TemporaryFile() as fp:
            dump(tokenizer, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(
                Body=fp.read(),
                Bucket='team08-public',
                Key='model/tokenizer.tokenizer')
            fp.close()
Example #25
    temp_X = []
    temp_X = kor.morphs(sentence)  # tokenize
    # remove the particles registered as stopwords
    temp_X = [word for word in temp_X if not word in stopwords]
    train_data_document.append(temp_X)

X_input = []
for sentence in df['document']:
    temp_X = []
    temp_X = kor.morphs(sentence)  # tokenize
    # remove the particles registered as stopwords
    temp_X = [word for word in temp_X if not word in stopwords]
    X_input.append(temp_X)

# Integer encoding: the text is turned into integers such as 1 2 3 4
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_document)

# Remove words that occur fewer times than the integer threshold
threshold = 2
total_cnt = len(tokenizer.word_index)  # number of words
rare_cnt = 0  # count of words whose frequency is below the threshold
total_freq = 0  # total frequency of all words in the training data
rare_freq = 0  # total frequency of the words whose frequency is below the threshold

# Iterate over the (word, frequency) pairs as key and value.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # If the word's frequency is below the threshold
    if (value < threshold):
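        # A sketch (assumed; the snippet is cut off here) of how this count is usually
        # completed: the totals feed the vocabulary-size calculation shown in Example #21.
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value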
Example #26
oov_tok = '<OOV>'
training_portion = 0.8

#Preprocessing
articles = df['text_without_stopwords']
authors = df['author']

train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_authors = authors[0: train_size]

validation_articles = articles[train_size:]
validation_authors = authors[train_size:]

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index
vocab_size=len(word_index)

train_sequences = tokenizer.texts_to_sequences(train_articles)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

author_tokenizer = Tokenizer()
author_tokenizer.fit_on_texts(authors)

training_author_seq = np.array(author_tokenizer.texts_to_sequences(train_authors))
validation_author_seq = np.array(author_tokenizer.texts_to_sequences(validation_authors))
    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)
    #     print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
#         print(" ".join(ranked_sentence[i][1]))
    return summarize_text
    # Step 5 - Of course, output the summarized text


#     print("Summarize Text: \n", ". ".join(summarize_text))

x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr))

thresh = 4

cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0

for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if (value < thresh):
        cnt = cnt + 1
        freq = freq + value
Example #28
    def train_new_model(self,
                        texts,
                        context_labels=None,
                        num_epochs=50,
                        gen_epochs=1,
                        batch_size=128,
                        dropout=0.0,
                        train_size=1.0,
                        validation=True,
                        save_epochs=0,
                        multi_gpu=False,
                        **kwargs):
        self.config = self.default_config.copy()
        self.config.update(**kwargs)

        print("Training new model w/ {}-layer, {}-cell {}LSTMs".format(
            self.config['rnn_layers'], self.config['rnn_size'],
            'Bidirectional ' if self.config['rnn_bidirectional'] else ''))

        # Create text vocabulary for new texts
        # if word-level, lowercase; if char-level, uppercase
        self.tokenizer = Tokenizer(filters='',
                                   lower=self.config['word_level'],
                                   char_level=(not self.config['word_level']))
        self.tokenizer.fit_on_texts(texts)

        # Limit vocab to max_words
        max_words = self.config['max_words']
        self.tokenizer.word_index = {
            k: v
            for (k, v) in self.tokenizer.word_index.items() if v <= max_words
        }

        if not self.config.get('single_text', False):
            self.tokenizer.word_index[self.META_TOKEN] = len(
                self.tokenizer.word_index) + 1
        self.vocab = self.tokenizer.word_index
        self.num_classes = len(self.vocab) + 1
        self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

        # Create a new, blank model w/ given params
        self.model = textgenrnn_model(self.num_classes,
                                      dropout=dropout,
                                      cfg=self.config)

        # Save the files needed to recreate the model
        with open('{}_vocab.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False)

        with open('{}_config.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.config, outfile, ensure_ascii=False)

        self.train_on_texts(texts,
                            new_model=True,
                            via_new_model=True,
                            context_labels=context_labels,
                            num_epochs=num_epochs,
                            gen_epochs=gen_epochs,
                            train_size=train_size,
                            batch_size=batch_size,
                            dropout=dropout,
                            validation=validation,
                            save_epochs=save_epochs,
                            multi_gpu=multi_gpu,
                            **kwargs)
Example #29
    config.logger.info(
        "Preprocessed data:\n"
        f"  {original_X[0]} → {X[0]}")

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
        X=X, y=y, val_size=args.val_size, test_size=args.test_size, shuffle=args.shuffle)
    config.logger.info(
        "Data splits:\n"
        f"\tX_train: {len(X_train)}, y_train: {len(y_train)}\n"
        f"\tX_val: {len(X_val)}, y_val: {len(y_val)}\n"
        f"\tX_test: {len(X_test)}, y_test: {len(y_test)}")

    # Tokenizer
    X_tokenizer = Tokenizer(
        filters=args.filters, lower=args.lower,
        char_level=args.char_level, oov_token='<UNK>')
    X_tokenizer.fit_on_texts(X_train)
    vocab_size = len(X_tokenizer.word_index) + 1  # +1 for padding token
    config.logger.info(f"vocab_size: {vocab_size}")

    # Convert texts to sequences of indices
    original_text = X_train[0]
    X_train = np.array(X_tokenizer.texts_to_sequences(X_train))
    X_val = np.array(X_tokenizer.texts_to_sequences(X_val))
    X_test = np.array(X_tokenizer.texts_to_sequences(X_test))
    preprocessed_text = X_tokenizer.sequences_to_texts([X_train[0]])[0]
    config.logger.info(
        "Text to indices:\n"
        f"  (raw) {original_text}\n"
        f"  (preprocessed) {preprocessed_text}\n"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog', 'I love my car', 'You love my dog!',
    'Do you think my dog is amazing?'
]
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<oov>')  # 100 most common words, out of vocabulary
tokenizer.fit_on_texts(sentences)  # upper==>lower, takes care of !
word_index = tokenizer.word_index  # makes a dictionary
print(word_index)
sequence = tokenizer.texts_to_sequences(sentences)
print(sequence)
padded = pad_sequences(sequence, padding='post', maxlen=5, truncating='post')
# makes sentences the same size by putting zero where there is no word
print(padded)
test_data = ['I really love my dog', 'My dog loves my manatee']
test_seq = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_seq, maxlen=10)
print(test_padded)