Example #1
# x_train_clean = pd.read_csv("/gdrive/My Drive/clean_train.csv")
# https://drive.google.com/file/d/1yc2Qy0dZC4Coj9RwiDVENhNTGQSLPXyq/view?usp=sharing

# x_test_clean = pd.read_csv("/gdrive/My Drive/clean_train.csv")
# https://drive.google.com/file/d/1MDGXRl5_OHGDOt1pnBn1RyZFZwF_1udv/view?usp=sharing

# after cleaning
x_train_clean[0]

"""##**2-5 Text to Vector**

Vectorizing the text corpus by turning each review text into a sequence of
integers, where each integer is the index of a token in a dictionary (based on the training-set vocabulary list).
"""

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_clean)
print('vocabulary size =', len(tokenizer.word_index))
# vocab_size = len(tokenizer.word_index) + 1

"""Training set has a very large vocabulary size (209,526 unique words in corpus).\
In order to reduce run time, a lower vocabulary size is used for the next steps.


**with # & no lem= 224,754
"""

vocab_size = 10000
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(x_train_clean)
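# A minimal sketch of the "text to vector" step the docstring above describes,
# using the 10,000-word tokenizer fitted on x_train_clean; maxlen=200 is a
# hypothetical choice, not taken from the original notebook.
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_seq = tokenizer.texts_to_sequences(x_train_clean)
x_train_pad = pad_sequences(x_train_seq, maxlen=200, padding='post')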
Example #2
#     routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0))
# del route
# gc.collect()
# # %%
# routes = pd.DataFrame(routes, columns=['cmr_' + str(i)
#                                        for i in range(24)] + ['cmr_None'])
# routes = routes.astype(int)
# routes = reduce_mem(routes, use_float16=False)
# routes = pd.concat([datatraintestA, routes], ignore_index=True)
# Cache.cache_data(routes, nm_marker='cmr_stage2_0924')  # one-hot encoding of the cmr feature
# the above is the data in combined train + test_a + test_b form

# %%

data = dataall
tokenizer = Tokenizer(num_words=24, filters='^')
communication_onlinerate_dict = [
    '0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23'
]
tokenizer.fit_on_texts(communication_onlinerate_dict)

# %%
communication_onlinerate_raw = data['communication_onlinerate'].tolist()
communication_onlinerate_sequences = tokenizer.texts_to_sequences(
    communication_onlinerate_raw)
communication_onlinerate_sequences = pad_sequences(
    communication_onlinerate_sequences, maxlen=24, padding='post')
communication_onlinerate_onehot = []
# %%
with tqdm(total=communication_onlinerate_sequences.shape[0]) as pbar:
    for i in communication_onlinerate_sequences:
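        # (sketch) the loop body is missing in the truncated original; based on
        # the commented-out code above, it presumably one-hot encodes each padded
        # row with np.eye(25) and advances the progress bar (numpy as np assumed):
        communication_onlinerate_onehot.append(np.sum(np.eye(25)[i], axis=0))
        pbar.update(1)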

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np

tokenizer = Tokenizer()
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt
data = open('/tmp/sonnets.txt').read()

corpus = data.lower().split("\n")


tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# create input sequences using list of tokens
input_sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
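	# (sketch) the snippet is truncated here; a common continuation collects
	# every n-gram prefix of the line as a training sequence:
	for i in range(1, len(token_list)):
		input_sequences.append(token_list[:i + 1])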
Example #4
def token(qdf):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(qdf.question)
    sequences = tokenizer.texts_to_sequences(qdf.question)
    
    return tokenizer, sequences
Example #5
sentences = []
labels = []

with open('데이터명.json') as f:  # '데이터명.json' is a placeholder for the data file name
    full_data = json.load(f)
for each_data in full_data:
    sentences.append(each_data['문장을 저장한 key 명칭'])  # key that stores the sentence text
    labels.append(each_data['레이블을 저장한 key 명칭'])  # key that stores the label

train_sentences = sentences[:20000]
train_labels = labels[:20000]

validation_sentences = sentences[20000:]
validation_labels = labels[20000:]

tokenizer = Tokenizer(num_words=vocab_size, oov_token='[OOV]')
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)

train_pad = pad_sequences(train_sequences,
                          maxlen=120,
                          truncating='post',
                          padding='post')
validation_pad = pad_sequences(validation_sequences,
                               maxlen=120,
                               truncating='post',
                               padding='post')

train_labels = np.array(train_labels)
Example #6
#Rename the columns.............
df.columns = ['labels', 'data']
print(df.head())

#Create binary label............
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

#Split the data...........
df_train, df_test, Ytrain, Ytest = train_test_split(df['data'],
                                                    Y,
                                                    test_size=0.33)

#Convert the sentences to sequences of words...............
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)

#word -> integer mapping............
word2idx = tokenizer.word_index
V = len(word2idx)
print(V)

#Padding to get N*T matrix.............
data_train = pad_sequences(sequences_train)
print(data_train.shape)

#Get the sequence length...........
T = data_train.shape[1]
Example #7
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

vocab_size = 10000
embedding_dim = 16
max_length = 32
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<oov>'
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences,
                                maxlen=max_length,
                                padding=padding_type,
                                truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences,
                               maxlen=max_length,
                               padding=padding_type,
                               truncating=trunc_type)

# single_test = np.array(["Sounds like a really useful program."]) 76
# single_test = np.array(["oh you got me!"]) 38
Example #8
# -------------------------------------------------------------------------------
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Add, Flatten, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
import numpy as np
import matplotlib.pyplot as plt

data = [
    "The cat is walking in the bedroom", "A dog was running in a room",
    "The cat is running in a room", "A dog is walking in a bedroom",
    "The dog was walking in the room"
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
word2idx = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(data)

# append <EOS> to the end of each sequence.
word2idx_len = len(word2idx)
word2idx['<EOS>'] = word2idx_len + 1  # add the end-of-sentence token
idx2word = {v: k for (k, v) in word2idx.items()}
sequences = [s + [word2idx['<EOS>']] for s in sequences]

print(sequences)


def prepare_sentence(seq, maxlen):
    # Pads seq and slides windows
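    # (sketch) the function body is cut off in the original; a typical
    # implementation returns one (history, next-word) pair per position,
    # with each history pre-padded to maxlen - 1:
    x, y = [], []
    for i, w in enumerate(seq):
        x_padded = pad_sequences([seq[:i]], maxlen=maxlen - 1, padding='pre')[0]
        x.append(x_padded)
        y.append(w)
    return x, y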
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# Expected output (if training_portion=.8)
# 1780
# 1780
# 1780
# 445
# 445

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Bidirectional, GlobalAveragePooling1D, concatenate, LeakyReLU, LSTM
from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding, Activation, Conv1D, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D

df = pd.read_csv('clean_csv_4k.csv')
df = df.fillna('')

with open('test_file.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

# convert from series to a list
text = df['twitts'].tolist()

y = df['sentiment']

token = Tokenizer()
token.fit_on_texts(text)

vocab_size = len(token.word_index) + 1

encoded_text = token.texts_to_sequences(text)

# Pad the sequences
max_len = max([len(s.split()) for s in text])

X = pad_sequences(encoded_text, maxlen=max_len, padding='post')

# our task is to get the global vectors for our words
# create empty matrix with the proper size
word_vector_matrix = np.zeros((vocab_size, 200))
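# The example ends before the matrix is filled; a sketch of the usual next step,
# assuming the pickled `embeddings` object is a {word: 200-d vector} mapping
# (an assumption, not confirmed by the code above):
for word, idx in token.word_index.items():
    vector = embeddings.get(word)
    if vector is not None:
        word_vector_matrix[idx] = vector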
Example #11
def process():
    df = pandas.read_csv('all-data.csv', encoding="latin-1")

    """
    Data text yang sudah ada perlu dibrsihkan agar data latih lebih baik dan bisa 
    digunakan untuk tahapan selanjutnya. Dikarenakan data yang dipakai adalah judul
    berita, maka data sudah sedikit baik dan hanya memerlukan sedikit perubahan 
    salah satunya menggunakan regex.
    """

    def regex (content):
        # menghapus simbol, angka, kata hubung
        content = re.sub(r'[^A-Za-z\s\/]' , ' ', content)
        # menghapus multispace (karena setelah dihapus simbol, angka dan kata hubung
        # terdapat banyak multispace)
        content = re.sub(r'\s\s+', '', content)
        # menghapus multispace dibelakang kalimat
        content = re.sub(r'\s+$', '', content)

        return content

    # apply the regex cleansing
    cleansing_result = []
    for i in df['News Headline']:
        cleansing = regex(i)
        cleansing_result.append(cleansing)
    df['News Headline'] = cleansing_result

    # convert the text in the News Headline column to lower case
    df['News Headline'] = df['News Headline'].str.lower()

    # replace the words negative, neutral and positive with numbers
    df['Sentiment'] = df['Sentiment'].replace("negative",0).replace("neutral",1).replace("positive",2)

    # take the News Headline sentences and convert them to an array
    X = df['News Headline'].values

    # take the Sentiment column and convert it to an array
    Y = df['Sentiment'].values

    # transform column Y into categorical data (as the case requires)
    Y = np_utils.to_categorical(Y, num_classes=3)

    # maximum frequency of each word
    MAX_WORD_FREQ = 500000

    # maximum number of words per News Headline
    MAX_WORD_SEQ = 250

    # set embedding layer dimension
    EMBEDDING_DIM = 50

    # tokenize the News Headline text
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(df['News Headline'].values)
    word_index = tokenizer.word_index
    # print('%s tokens.' % len(word_index))

    # build the embedding layer with GloVe
    embeddings_index = {}
    with open('glove.6B.50d.txt',encoding="utf8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    # print("Found %s words." % len(embeddings_index))

    found = 0

    # number of tokens plus the zero-padding index
    TOKEN_NUM = len(word_index)+1

    # prepare the embedding matrix; an entry stays 0 when the word is not found
    embedding_matrix = np.zeros((TOKEN_NUM, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found += 1

    # print("Found {} words from {} ".format(found,len(word_index)))

    # set the model input
    X_train = tokenizer.texts_to_sequences(df['News Headline'].values)
    X_train = pad_sequences(X_train, maxlen=MAX_WORD_SEQ)
    # print('Shape of data tensor:', X_train.shape)

    # set the Sentiment target of the model
    Y_train = pd.get_dummies(df['Sentiment']).values
    # print('Shape of label tensor:', Y_train.shape)

    """
    dikarenakan jumlah kelas yang tidak balance maka dari itu digunakan random 
    over sampler, penerapan random over sampler pada kasus ini lebih baik 
    dibandingkan dengan teknik smote  
    """
    ros = RandomOverSampler(random_state=777)
    X_ROS, y_ROS = ros.fit_sample(X_train, Y_train)

    # split the data, using 20% of the whole dataset as test data
    X_train, x_test, Y_train, y_test = train_test_split(X_ROS,y_ROS,test_size=0.2,random_state=42)

    # embedding layer for the LSTM input
    embedding_layer = Embedding(TOKEN_NUM, EMBEDDING_DIM, weights=[embedding_matrix], input_length = 250, trainable=False)

    # initialize the embedding layer dimension
    embedding_dim = 50

    """
    GRU merupakan algoritma Neural Network yang kompleks dan sangat baik dalam pengolahan NLP, 
    algoritma ini lebih cepat dalam melakukan training dibandingkan dengan LSTM namun performanya tetap baik,
    GRU menangani masalah kehilangan informasi akibat data sequential yang teralu panjang yang dapat menurunkan hasil training,
    data train yang digunakan tidak terlalu besar maka dari itu GRU cocok dengan kasus ini.
    """
    # initialize the sequential model
    model = Sequential()
    model.add(embedding_layer)

    # use a GRU layer, a more complex member of the RNN family
    model.add(GRU(256, dropout=0.25))

    # add a dense layer
    model.add(Dense(64, activation='relu'))

    # output dense layer: one output per class, with softmax for
    # multiclass classification
    model.add(Dense(3, activation='softmax'))

    # compile the model with the Adam optimizer and categorical cross-entropy loss
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # view the model summary
    model.summary()
    # fit model 
    # history = model.fit(X_train, Y_train,epochs=100, validation_split=0.2, batch_size=100)

    # view the test accuracy
    # result = model.evaluate(x_test,y_test)

    # save the model
    # model.save_weights("Sentiment_Financial_News.h5")
    # load the model
    model.load_weights('Sentiment_Financial_News_.h5')
    
    # test the model on new data, using preprocessing similar to that used
    # when training the model

    # con=psycopg2.connect(host = 'localhost', database='final', user='******', password = '******')
    # cur = con.cursor()
    # cur.execute('select * from dataset')
    # rows = cur.fetchall()

    # rows = rows['News Headline'].tolist()

    # cur.close()
    # con.close()

    test = pandas.read_csv('test.csv', encoding='latin-1')

    def regex (content):

        # remove symbols, numbers, and connecting words
        content = re.sub(r'[^A-Za-z\s\/]' , ' ', content)

        # remove multiple spaces
        content = re.sub(r'\s\s+', '', content)
        
        # remove trailing whitespace at the end of the sentence
        content = re.sub(r'\s+$', '', content)

        return content

    cleansing_result = []
    for i in test['News Headline']:
        cleansing = regex(i)
        cleansing_result.append(cleansing)   

    test['News Headline'] = cleansing_result
    # apply the regex cleansing
    test['News Headline'] = test['News Headline'].str.lower()

    # run the test using the model that was built
    new_data= test["News Headline"]
    seq = tokenizer.texts_to_sequences(new_data)
    padded = pad_sequences(seq, maxlen=250)
    pred = model.predict(padded)
    labels = ["Negative","Neutral","Positive"]
    # print(pred, labels[np.argmax(pred)])
    # loop to predict each text
    newtest =[]
    for x in pred:
      newtest.append(labels[np.argmax(x)])
      label = pd.DataFrame(data=newtest,columns=['Sentiment'])
    hasil = pd.concat([test,label], axis=1)
    # save the prediction results
    hasil = hasil.to_csv(os.path.join(app.config['SAVED_FOLDER'], 'hasil.csv'), index=False)

    filename_new = 'hasil.csv'
    # filename_new = predict_model(Sentiment=Sentiment)
    # db.session.add(filename_new)
    # db.session.commit()
    dataTable = csv_convert_result_pre(filename_new)
    # return render_template('process-done.html')
    return render_template('process-done.html', tableTesting = dataTable['Table'],\
        rows = dataTable['Rows'], cols = dataTable['Cols'],\
        filename = filename_new, dnTesting = False)
Example #12
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

base_dir='/Users/ashishbansal/PycharmProjects/TensorflowProject/Coursera/'
data_dir=base_dir+'Data/'
# wget --no-check-certificate \
#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
#     -O /Users/ashishbansal/PycharmProjects/TensorflowProject/Coursera/Data/irish-lyrics-eof.txt

with open(data_dir+'irish-lyrics-eof.txt') as f:
    data=f.read()
#print(data)
corpus=data.lower().split('\n')
#print(corpus)

token=Tokenizer()
token.fit_on_texts(corpus)
total_word=token.word_index
total_word=len(total_word)+1
#print(total_word)

# sequence=token.texts_to_sequences(corpus)
# print(sequence)

input_data=[]
for line in corpus:
    sequence=token.texts_to_sequences([line])[0]
    for i in range(1,len(sequence)):
        se=sequence[0:i+1]
        input_data.append(se)
print(input_data)
# Convolution
filter_length = 3
nb_filters = 128
n_gram = 3
cnn_dropout = 0.0
nb_rnnoutdim = 300
rnn_dropout = 0.0
nb_labels = 1
dense_wl2reg = 0.0
dense_bl2reg = 0.0

texts = data_train

texts = texts.map(lambda x: clean_text(x))

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts)
encoded_train = tokenizer.texts_to_sequences(texts=texts)
vocab_size_train = len(tokenizer.word_index) + 1
print(vocab_size_train)

x_train = sequence.pad_sequences(encoded_train,
                                 maxlen=time_step,
                                 padding='post')

texts = data_rest

texts = texts.map(lambda x: clean_text(x))

encoded_test = tokenizer.texts_to_sequences(texts=texts)
Example #14
mail = data.email
mail = mail.astype(str)
label = data.label
le = LabelEncoder()
label = le.fit_transform(label)
label = label.reshape(-1, 1)
X_train, X_test, Y_train, Y_test = train_test_split(mail, label, test_size=0.2)

vocab = set()
for e in mail:
    for w in e.split():
        vocab.add(w)

max_words = len(vocab)  # Vocab max size
max_len = 100  # Sentences padded to 100 words vector
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)

# saving tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(max_words, 50, input_length=max_len),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(512, activation='relu'),
Example #15
def run():
    df = pd.read_csv(config.INPUT_FILE)

    if config.TRAIN_PROMPT:
        df = df[['prompt', 'essay', config.TRAIN_FOR]]
    else:
        df = df[['essay', config.TRAIN_FOR]]

    df['essay_cleaned'] = df['essay'].apply(utils.replace_label)

    tokenizer = Tokenizer(num_words=config.VOCAB_SIZE)
    if config.TRAIN_PROMPT:
        tokenizer.fit_on_texts(df['prompt'])
    tokenizer.fit_on_texts(df['essay_cleaned'])

    X = utils.preprocess(df['essay_cleaned'], tokenizer, config.MAX_LEN)
    if config.TRAIN_PROMPT:
        X_prompt = utils.preprocess(df['prompt'], tokenizer,
                                    config.MAX_LEN_PROMPT)

    y = df[config.TRAIN_FOR].values

    # Uncomment if getting "DNN implementation Not Found" Error
    # physical_devices = tf.config.list_physical_devices('GPU')
    # tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

    embeddings = utils.load_embedding_matrix(tokenizer, config.GLOVE_PATH)

    if config.TRAIN_PROMPT:
        model = utils.get_model_prompt()
    else:
        model = utils.get_model(embeddings)

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    mcp_save = ModelCheckpoint(
        filepath=
        f'../models/model-PROMPT_{config.TRAIN_PROMPT}_{config.TRAIN_FOR}_epochs_{config.EPOCHS}_{datetime.now()}.h5',
        save_best_only=True,
        monitor='val_mae',
        mode='min',
        verbose=1)

    earlyStopping = EarlyStopping(monitor='val_loss',
                                  patience=10,
                                  verbose=1,
                                  mode='min')

    if config.TRAIN_PROMPT:
        history = model.fit([X_prompt, X],
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.2,
                            verbose=1,
                            callbacks=[mcp_save, earlyStopping])
    else:
        history = model.fit(X,
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.3,
                            verbose=1,
                            shuffle=True,
                            callbacks=[mcp_save, earlyStopping])
    # print(model.summary())
    '''
    For saving pickle model
    with open(f'../models/model-TRAIN_PROMPT-{config.TRAIN_PROMPT}-\
    {config.TRAIN_FOR}-epochs-{config.EPOCHS}-\
    {datetime.now()}.pickle', 'wb') as handle:
        pickle.dump(history.history, handle)

    with open(f'../models/tokenizer_essays.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle)
    '''

    # Saving the model
    if config.TRAIN_PROMPT:
        MODEL_DIR = f"../models/prompt-essay/PROMPT_{config.TRAIN_FOR}"
    else:
        MODEL_DIR = f"../models/{config.TRAIN_FOR}"
    version = "1"
    export_path = os.path.join(MODEL_DIR, version)
    print('export_path = {}\n'.format(export_path))

    tf.keras.models.save_model(model,
                               export_path,
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
Example #16
class model_application:
    def __init__(self, newData):
        self.dataform = '.\\Dataset\\dataset_form.csv'
        self.dataset = '.\\Dataset\\prototype_final_shuffle_dataset(50000)_real.csv'
        self.newdata = newData
        self.model_tokenizer = 'tokenizer.word_index_original_new_100000'
        self.model_nlp = '.\\Model\\illegal_nlp_model_new_100000.h5'
        self.model_tokenizer_path = '.\\Model\\' + self.model_tokenizer
        self.max_num_words = 100000
        self.max_len = 2720  # each document is fixed to a length of 2720 (used when padding).
        self.tokenizer = Tokenizer(num_words=self.max_num_words)  # create the tokenizer object first

    def split(self, _input_X_data):
        tmp = []
        for _test_list in _input_X_data[:]:
            _test_list = _test_list.split()
            tmp.append(_test_list)
        return tmp

    # Each document has been converted into a list of words; after inspection,
    # none_exist holds only the words that do not exist in the tokenizer's vocabulary.
    def none_inspection(self, _tmp_X_data, _tokenizer):
        tmp2 = []
        for list in _tmp_X_data:
            tmp = []
            for item in list:
                try:
                    _tokenizer.word_index[item]
                    # check whether the words of the new document exist in the saved index
                except:
                    tmp.append(item)
            tmp2.append(tmp)
        return tmp2

    def delete_none(self, _tmp_X_data, _none_exist):
        count = 0
        tmp = []
        for list in _tmp_X_data[:]:
            for item in _none_exist[count][:]:
                list.remove(item)
            list = " ".join(list)
            tmp.append(list)
            count = count + 1
        return tmp

    def read_data(self, form):
        if form == 'csv':
            _data = pd.read_csv(self.newdata, encoding='utf-8')
            _data = _data.astype(str)
            return _data
        elif form == 'url':
            _data = pd.read_csv(self.dataform, encoding='utf-8')
            _data = _data.astype(str)
            return _data

    def check_tokenizer(self):
        if not os.path.isfile(self.model_tokenizer_path):
            print(self.model_tokenizer + " DOESN'T exist")
            data = pd.read_csv(self.dataset, encoding='utf-8')
            data = data.astype(str)
            X_data = data['body']
            # y_data = data['classification']

            print(data.isnull().values.any())
            print(data.info)
            self.tokenizer.fit_on_texts(X_data)  # tokenize each row of X
            with open(self.model_tokenizer, 'wb') as f:
                print('SAVING TOKENIZER')
                pickle.dump(self.tokenizer, f)
            return self.tokenizer
        else:
            print(self.model_tokenizer + ' EXISTS')
            with open(self.model_tokenizer_path, 'rb') as f:
                print('LOADING TOKENIZER')
                _tokenizer = pickle.load(f)
            return _tokenizer

    def data_processing(self, _data, _tokenizer):
        input_X_data = _data['body']
        converted_input_X_data = input_X_data
        tmp_X_data = self.split(converted_input_X_data)
        none_exist = self.none_inspection(tmp_X_data, _tokenizer)
        tmp_X_data = self.delete_none(tmp_X_data, none_exist)
        for i in range(0, len(input_X_data)):
            input_X_data[i] = tmp_X_data[i]
        sequences = _tokenizer.texts_to_sequences(input_X_data)
        sequences_X_data = sequences
        _final_X_data = pad_sequences(sequences_X_data, maxlen=self.max_len)
        return _final_X_data

    def load_prediction(self, _final_X_data):
        _percentage = []
        model = load_model(self.model_nlp)
        _predictions = model.predict_classes(_final_X_data, verbose=2)
        _probability = model.predict_proba(_final_X_data, verbose=2)
        for number in _probability:
            _percentage.append(float(number) * 100)

        return _predictions, _percentage
Example #17
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog', 'I love my cat', 'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=5)
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)

test_data = ['i really love my dog', 'my dog loves my manatee']

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")
print(padded)
Example #18
        for m in range(len(list2[k])):
            para = list2[k][m]
            sentences = tokenize.sent_tokenize(para)
            for sent in sentences:
                page_sentences.append(sent)
    sent_list.append(page_sentences)


''' This data array contains the word index of every word present in the page,
    and each page is represented as a list of sentences.
'''
data = np.zeros((len(df),MAX_SENTS, MAX_SENT_LENGTH), dtype='float32')


# Fit all the scanned data into the tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token=True)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index


# Generate the word index of each word using the tokenizer
for i, sentences in enumerate(sent_list):
    for m, sent in enumerate(sentences):
        if m < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for i_ , word in enumerate(wordTokens):
                if(k < MAX_SENT_LENGTH):
                    if word in tokenizer.word_index:
                        if(tokenizer.word_index[word] < MAX_NB_WORDS):
Example #19
model.load_weights('./exp')

data = treebank.tagged_sents()
X = []
Y = []
for sents in data:
    token_sequence = []
    tag_sequence = []
    for token in sents:
        token_sequence.append(token[0])
        tag_sequence.append(token[1])
    X.append(token_sequence)
    Y.append(tag_sequence)

# encode X
word_tokenizer = Tokenizer()  # instantiate tokeniser
word_tokenizer.fit_on_texts(X)  # fit tokeniser on data
# use the tokeniser to encode input sequence
X_encoded = word_tokenizer.texts_to_sequences(X)
# encode Y
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y)
Y_encoded = tag_tokenizer.texts_to_sequences(Y)

# sequences greater than 100 in length will be truncated
MAX_SEQ_LENGTH = 100
X_padded = pad_sequences(X_encoded,
                         maxlen=MAX_SEQ_LENGTH,
                         padding="pre",
                         truncating="post")
Y_padded = pad_sequences(Y_encoded,
car_components_df = cardata_ds.to_pandas_dataframe()
components = car_components_df["text"].tolist()
labels = car_components_df["label"].tolist()

print("Processing car components data completed.")

#-------------------------------------------------------------------
#
# Use the Tokenizer from Keras to "learn" a vocabulary from the entire car components text
#
#-------------------------------------------------------------------

print("Tokenizing data...")

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(components)
sequences = tokenizer.texts_to_sequences(components)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=embedding_dim)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print("Tokenizing data complete.")

#-------------------------------------------------------------------
#
Example #21
# Let's classify positive vs. negative sentiment
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

docs = [
    '너무 재밌어요', '참 최고에요', '참 잘 만든 영화예요', '추천하고 싶은 영화입니다.', '한 번 더 보고 싶네요',
    '글쎄요', '별로에요', '생각보다 지루해요', '연기가 어색해요', '재미없어요', '너무 재미없다', '참 재밌네요',
    '규현이가 잘 생기긴 했어요'
]

# positive = 1, negative = 0
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
# {'참': 1, '너무': 2, '잘': 3, '재밌어요': 4, '최고에요': 5,
#  '만든': 6, '영화예요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10,
#  '한': 11, '번': 12, '더': 13, '보고': 14, '싶네요': 15, '글쎄요': 16,
#  '별로에요': 17, '생각보다': 18, '지루해요': 19, '연기가': 20, '어색해요': 21,
#  '재미없어요': 22, '재미없다': 23, '재밌네요': 24, '규현이가': 25, '생기긴': 26, '했어요': 27}

# convert the sentences to numbers
x = token.texts_to_sequences(docs)
print(x)
# [[2, 4], [1, 5], [1, 3, 6, 7], [8, 9, 10], [11, 12, 13, 14, 15], [16],
#  [17], [18, 19], [20, 21], [22], [2, 23], [1, 24], [25, 3, 26, 27]]

# Problem: the sentences have different lengths
# Solution: pad the shorter sentences with 0s, using the longest sentence as the reference length
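# A minimal sketch of the padding step described above, applied to the
# sequences in `x`:
from tensorflow.keras.preprocessing.sequence import pad_sequences

pad_x = pad_sequences(x, padding='pre')
print(pad_x)  # shorter sentences are left-padded with zeros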
texts_test = [''.join(clean_text(text)) for text in X_test]

texts

data["Text"][92]

texts_train[92]





# Using the TensorFlow Keras Tokenizer to: create an internal vocabulary (words to integers)
# and arrange sentences as integer sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_train = tokenizer.texts_to_sequences(texts_train)
sequence_test = tokenizer.texts_to_sequences(texts_test)

index_of_words = tokenizer.word_index

# vocab size is the number of unique words + the reserved 0 index for padding
vocab_size = len(index_of_words) + 1

print('Number of unique words: {}'.format(len(index_of_words)))



sequence_train,sequence_test
Example #23
def tokenizer_data(list_tweets):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list_tweets)
    sequences = tokenizer.texts_to_sequences(list_tweets)
    vocab_size = len(tokenizer.word_index)
    return sequences, tokenizer
Example #24
OOV_TOK = "<OOV>" #Out Of Vocabulary Handling
TRAIN_SIZE = 15542

##Data Loading and Preprocessing
Train = pd.read_csv("train.csv")

Train = Train.dropna()
Train = Train.copy()
Train.reset_index(inplace = True)
x = Train['title']
y = Train['label']
x = np.array(x)
y = np.array(y)
train_sentences, test_sentences, train_labels, test_labels = model_selection.train_test_split(x, y, test_size = 0.15, random_state=101)
    
train_sentences = np.array(train_sentences)
test_sentences = np.array(test_sentences)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(train_sentences)
wordIndex = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

Example #25
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import word_tokenize
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) +
                     r')\b\s*')
tokenizer = Tokenizer()
from tensorflow.keras.layers import Embedding
'''
About the task:
You are provided with a codeflow, which consists of functions to be implemented (MANDATORY).
You need to implement each of the functions mentioned below; you may add your own function parameters if needed (but not to main).
Execute your code using the provided auto.py script (NO EDITS PERMITTED), as your code will be evaluated using an auto-grader.
'''


def embedding(vocab_size, word_index):
    embeddings_index = {}
    with open('/content/drive/MyDrive/CS772/glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
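            # (sketch) the function is truncated here; the usual GloVe-loading
            # continuation, mirroring Example #11 above (numpy as np assumed):
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs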
Example #26
    "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves"
]

sentences = []
labels = []

with open(data_file, "r") as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    next(reader)
    for row in reader:
        labels.append(row[0])
        # sentence = row[1]
        sentence = " ".join([w for w in row[1].split() if w not in stopwords])
        sentences.append(sentence)

# print(len(sentences))
# print(sentences[0])

# Tokenize sentences
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(f"Length of word_index: {len(word_index)}")

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(f"Padded shape: {padded.shape}")
Example #27
File: main.py Project: gurkan08/nlp
    def run_preprocess(data):
        # preprocess
        data = Main.do_preprocess(data)

        # split train-test
        X_train, X_test, y_train, y_test = train_test_split(
            data["text"],
            data["label"],
            test_size=Params.test_size,
            random_state=42,
            stratify=data["label"])

        # max sentence size
        Main.find_max_sentence_size(pd.DataFrame(X_train, columns=["text"]))
        #print("mean sentence size --> ", Params.max_sent_size)

        # train data
        train_df = pd.DataFrame(zip(X_train, y_train),
                                columns=["text", "label"])
        Main.sentence_tokenizer = Tokenizer(
            oov_token="UNK",
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True)  # 0 index reserved as padding_value
        Main.sentence_tokenizer.fit_on_texts(train_df["text"])
        train_sentences = Main.sentence_tokenizer.texts_to_sequences(
            train_df["text"])  # list
        train_sentences = pad_sequences(train_sentences,
                                        maxlen=Params.max_sent_size,
                                        padding="post",
                                        value=0.)
        with open(os.path.join(Params.model_dir, "sentence_tokenizer.pickle"),
                  "wb") as handle:
            pickle.dump(Main.sentence_tokenizer,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        Main.label_tokenizer = Tokenizer(
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
        Main.label_tokenizer.fit_on_texts(train_df["label"])
        train_labels = Main.label_tokenizer.texts_to_sequences(
            train_df["label"])  # list
        train_labels = np.array(train_labels)
        train_labels = [
            to_categorical(i - 1,
                           num_classes=len(Main.label_tokenizer.word_index))
            for i in train_labels
        ]
        train_labels = np.array(train_labels)
        train_labels = train_labels.reshape(
            (train_labels.shape[0],
             train_labels.shape[-1]))  # [n_samples, n_labels]
        with open(os.path.join(Params.model_dir, "label_tokenizer.pickle"),
                  "wb") as handle:
            pickle.dump(Main.label_tokenizer,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # test data
        test_df = pd.DataFrame(zip(X_test, y_test), columns=["text", "label"])
        test_sentences = Main.sentence_tokenizer.texts_to_sequences(
            test_df["text"])  # list
        test_sentences = pad_sequences(test_sentences,
                                       maxlen=Params.max_sent_size,
                                       padding="post",
                                       value=0.)

        test_labels = Main.label_tokenizer.texts_to_sequences(test_df["label"])
        test_labels = np.array(test_labels)
        test_labels = [
            to_categorical(i - 1,
                           num_classes=len(Main.label_tokenizer.word_index))
            for i in test_labels
        ]  # list
        test_labels = np.array(test_labels)
        test_labels = test_labels.reshape(
            (test_labels.shape[0],
             test_labels.shape[-1]))  # [n_samples, n_labels]

        # fasttext embedding init
        #Main.fasttext_embedding_init()

        return train_sentences, train_labels, test_sentences, test_labels
    sent = sent.replace('.', '')
    sent = sent.replace('?', '')
    sent = sent.replace('/', '')
    sent = sent.replace(':', '')
    sent = sent.replace(';', '')

    return sent


# Tokenizer

oov_token = "<OOV>"
max_length = 20
num_topic_words = 4

tokenizer = Tokenizer(oov_token=oov_token)

with open(f"{path_to_respgen}/bin/Tokens.txt", 'r') as file:
    js_string = file.read()
    tokenizer = tokenizer_from_json(js_string)
word_index = tokenizer.word_index
word_index['startsent'] = 0
word_index['endsent'] = len(word_index) + 1
index_word = {word_index[word]: word for word in word_index}
vocab_size = len(word_index) + 1


def preprocess_sent(text_list):
    inputs = []
    for sent in text_list:
        inputs.append(remove_char(sent))
Example #29
    for row in data:
        train_sentences.append(row[2])
        labels.append(row[1])
        id.append(row[0])
labels = np.array(labels)

#hyperparameter
vocab_size = 10000
embedding_size = 64
max_length = 40
trunc_type = "post"
oov_tok = "<OOV>"
num_epochs = 3

#prepare training data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
sequences = tokenizer.texts_to_sequences(train_sentences)
padded_sequences = pad_sequences(sequences,
                                 maxlen=max_length,
                                 truncating=trunc_type)
word_index = tokenizer.word_index

#model building
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,
                              embedding_size,
                              input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
Example #30
teks = df_baru['Teks'].values
label = df_baru[['penipuan', 'promo', 'sms']].values
"""**Training and Validation Spliting** <br>
dengan 20% data test. sudah menggunakan fungsi tokenizer untuk menggabungkan data teks.
"""

from sklearn.model_selection import train_test_split

teks_latih, teks_test, label_latih, label_test = train_test_split(
    teks, label, test_size=0.2)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(teks_latih)
tokenizer.fit_on_texts(teks_test)

sekuens_latih = tokenizer.texts_to_sequences(teks_latih)
sekuens_test = tokenizer.texts_to_sequences(teks_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)
"""Terdapat 3 kelas kategorikal, dilihat dari shape."""

print(label.shape)
"""**Pembuatan Model Layer** <br>
menggunakan embedding dan LSTM. relu dibuat agar fully connected antar layer
sementara dropout untuk mengurangi overfitting.
"""