# Optional shortcut: load the already-cleaned corpora instead of re-running
# the cleaning step.
# NOTE(review): the commented-out test line reads clean_train.csv as well --
# confirm the intended file name before re-enabling it.
# x_train_clean = pd.read_csv("/gdrive/My Drive/clean_train.csv")
# https://drive.google.com/file/d/1yc2Qy0dZC4Coj9RwiDVENhNTGQSLPXyq/view?usp=sharing
# x_test_clean = pd.read_csv("/gdrive/My Drive/clean_train.csv")
# https://drive.google.com/file/d/1MDGXRl5_OHGDOt1pnBn1RyZFZwF_1udv/view?usp=sharing

# after cleaning
x_train_clean[0]

# ## **2-5 Text to Vector**
# Vectorize the text corpus by turning each review text into a sequence of
# integers, where each integer is the index of a token in a dictionary built
# from the training-set vocabulary.

# First pass: an unrestricted tokenizer, fitted only to report how large the
# full training vocabulary is.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_clean)
print('vocabulary size =', len(tokenizer.word_index))
# vocab_size = len(tokenizer.word_index) + 1

# The training set has a very large vocabulary (209,526 unique words in the
# corpus; 224,754 when '#' is kept and no lemmatization is applied).
# To reduce run time, a smaller vocabulary is used for the next steps.

# Second pass: refit with the vocabulary capped at the most frequent words.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train_clean)
# routes.append(np.sum(np.eye(25)[route.iloc[i, :]], axis=0)) # del route # gc.collect() # # %% # routes = pd.DataFrame(routes, columns=['cmr_' + str(i) # for i in range(24)] + ['cmr_None']) # routes = routes.astype(int) # routes = reduce_mem(routes, use_float16=False) # routes = pd.concat([datatraintestA, routes], ignore_index=True) # Cache.cache_data(routes, nm_marker='cmr_stage2_0924') # cmr 特征onehott # 以上为train+test_a+test_b的数据形式 # %% data = dataall tokenizer = Tokenizer(num_words=24, filters='^') communication_onlinerate_dict = [ '0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23' ] tokenizer.fit_on_texts(communication_onlinerate_dict) # %% communication_onlinerate_raw = data['communication_onlinerate'].tolist() communication_onlinerate_sequences = tokenizer.texts_to_sequences( communication_onlinerate_raw) communication_onlinerate_sequences = pad_sequences( communication_onlinerate_sequences, maxlen=24, padding='post') communication_onlinerate_onehot = [] # %% with tqdm(total=communication_onlinerate_sequences.shape[0]) as pbar: for i in communication_onlinerate_sequences:
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.models import Sequential from tensorflow.keras.optimizers import Adam from tensorflow.keras import regularizers import tensorflow.keras.utils as ku import numpy as np tokenizer = Tokenizer() !wget --no-check-certificate \ https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \ -O /tmp/sonnets.txt data = open('/tmp/sonnets.txt').read() corpus = data.lower().split("\n") tokenizer.fit_on_texts(corpus) total_words = len(tokenizer.word_index) + 1 # create input sequences using list of tokens input_sequences = [] for line in corpus: token_list = tokenizer.texts_to_sequences([line])[0]
def token(qdf):
    """Fit a Keras ``Tokenizer`` on ``qdf.question`` and encode those questions.

    The vocabulary cap comes from the module-level ``max_words``.

    Args:
        qdf: Object exposing a ``question`` attribute holding the texts.

    Returns:
        A ``(tokenizer, sequences)`` tuple: the fitted tokenizer and the
        integer sequences produced for the same questions.
    """
    questions = qdf.question
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(questions)
    return tokenizer, tokenizer.texts_to_sequences(questions)
# Load every record of the JSON dataset.  JSON text is UTF-8 by specification,
# so decode it explicitly instead of relying on the platform default encoding.
with open('데이터명.json', encoding='utf-8') as f:
    full_data = json.load(f)

# Split each record into its text and its label.
sentences = [each_data['문장을 저장한 key 명칭'] for each_data in full_data]
labels = [each_data['레이블을 저장한 key 명칭'] for each_data in full_data]

# The first 20,000 records are used for training, the remainder for validation.
train_sentences = sentences[:20000]
train_labels = labels[:20000]
validation_sentences = sentences[20000:]
validation_labels = labels[20000:]

# The vocabulary is learned from the training split only; words outside it
# are mapped to the '[OOV]' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='[OOV]')
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)

# Pad / truncate every sequence at its end to exactly 120 tokens.
train_pad = pad_sequences(train_sequences, maxlen=120,
                          truncating='post', padding='post')
validation_pad = pad_sequences(validation_sequences, maxlen=120,
                               truncating='post', padding='post')

train_labels = np.array(train_labels)
# Give the two columns descriptive names.
df.columns = ['labels', 'data']
print(df.head())

# Binary target: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

# Hold out a third of the messages for testing.
df_train, df_test, Ytrain, Ytest = train_test_split(
    df['data'], Y, test_size=0.33)

# Turn each sentence into a sequence of word indices; the vocabulary is
# learned from the training messages only.
MAX_VOCUB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCUB_SIZE)
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)

# word -> integer mapping; V is the number of distinct words seen.
word2idx = tokenizer.word_index
V = len(word2idx)
print(V)

# Pad the training sequences into an N x T matrix.
data_train = pad_sequences(sequences_train)
print(data_train.shape)

# T is the padded sequence length.
T = data_train.shape[1]
y = df['label'].values

# Reproducible 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

# Tokenisation / padding hyper-parameters.
vocab_size = 10000
embedding_dim = 16
max_length = 32
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<oov>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The vocabulary is learned from the training texts only; unseen words map
# to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Encode both splits with the same tokenizer and the same padding settings.
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# single_test = np.array(["Sounds like a really useful program."]) 76
# single_test = np.array(["oh you got me!"]) 38
# ------------------------------------------------------------------------------- from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.layers import Input, Embedding, Dense, Add, Flatten, LSTM from tensorflow.keras.models import Model from tensorflow.keras import optimizers import numpy as np import matplotlib.pyplot as plt data = [ "The cat is walking in the bedroom", "A dog was running in a room", "The cat is running in a room", "A dog is walking in a bedroom", "The dog was walking in the room" ] tokenizer = Tokenizer() tokenizer.fit_on_texts(data) word2idx = tokenizer.word_index sequences = tokenizer.texts_to_sequences(data) # sequences 뒤에 <EOS>를 추가한다. word2idx_len = len(word2idx) word2idx['<EOS>'] = word2idx_len + 1 # end of sentence 추가 idx2word = {v: k for (k, v) in word2idx.items()} sequences = [s + [word2idx['<EOS>']] for s in sequences] print(sequences) def prepare_sentence(seq, maxlen): # Pads seq and slides windows
validation_labels = labels[train_size:]

# Report the split sizes.
print(train_size)
for split_part in (train_sentences, train_labels,
                   validation_sentences, validation_labels):
    print(len(split_part))

# Expected output (if training_portion=.8)
# 1780
# 1780
# 1780
# 445
# 445

# Learn the vocabulary from the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences,
                             padding=padding_type,
                             maxlen=max_length)

# Compare raw and padded lengths for a few sample rows.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Bidirectional, GlobalAveragePooling1D, concatenate, LeakyReLU, LSTM
from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding, Activation, Conv1D, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D

# Load the cleaned tweets; missing cells become empty strings.
df = pd.read_csv('clean_csv_4k.csv').fillna('')

# Pre-computed word embeddings stored as a pickle.
with open('test_file.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

# convert from series to a list
text = df['twitts'].tolist()
y = df['sentiment']

token = Tokenizer()
token.fit_on_texts(text)
# +1 because index 0 is never assigned to a word.
vocab_size = len(token.word_index) + 1
encoded_text = token.texts_to_sequences(text)

# Pad every sequence (at its end) to the whitespace-token count of the
# longest tweet.
max_len = max(len(s.split()) for s in text)
X = pad_sequences(encoded_text, maxlen=max_len, padding='post')

# Next step: look up the global vector of each word.
# Start from an all-zero matrix with one 200-wide row per vocabulary index.
word_vector_matrix = np.zeros((vocab_size, 200))
def process():
    """Classify the headlines in ``test.csv`` and render the result page.

    Steps performed on every call:

    1. Read ``all-data.csv``, clean the ``News Headline`` column and refit the
       tokenizer on it, so word indices match the ones the saved weights were
       produced with.
    2. Build the GloVe (50-d) embedding matrix for that vocabulary.
    3. Rebuild the Embedding -> GRU -> Dense network and load
       ``Sentiment_Financial_News_.h5``.
    4. Clean and predict ``test.csv``, write ``hasil.csv`` into
       ``app.config['SAVED_FOLDER']`` and render ``process-done.html``.

    Returns:
        The rendered ``process-done.html`` response.
    """
    # Maximum number of tokens kept per headline (shorter ones are padded).
    MAX_WORD_SEQ = 250
    # Width of the GloVe vectors / embedding layer.
    EMBEDDING_DIM = 50

    def regex(content):
        """Apply the regex clean-up used for every headline."""
        # Replace symbols and digits with a space.
        content = re.sub(r'[^A-Za-z\s\/]', ' ', content)
        # Collapse runs of whitespace.
        # NOTE(review): the replacement is '' rather than ' ', so words that
        # were separated by two or more whitespace characters are joined.
        # Kept unchanged because the saved weights were presumably produced
        # with this exact preprocessing -- confirm before changing.
        content = re.sub(r'\s\s+', '', content)
        # Strip trailing whitespace.
        content = re.sub(r'\s+$', '', content)
        return content

    def clean_headlines(frame):
        """Clean and lower-case the 'News Headline' column of *frame* in place."""
        frame['News Headline'] = [regex(text) for text in frame['News Headline']]
        frame['News Headline'] = frame['News Headline'].str.lower()

    # The headlines are already fairly clean, so a light regex pass is enough.
    df = pandas.read_csv('all-data.csv', encoding="latin-1")
    clean_headlines(df)

    # Encode the sentiment words as integers.
    df['Sentiment'] = (df['Sentiment']
                       .replace("negative", 0)
                       .replace("neutral", 1)
                       .replace("positive", 2))

    # Tokenise the training headlines.
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(df['News Headline'].values)
    word_index = tokenizer.word_index

    # Load the GloVe vectors: one "word v1 v2 ..." entry per line.
    embeddings_index = {}
    with open('glove.6B.50d.txt', encoding="utf8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    # Vocabulary size plus the zero-padding index.
    TOKEN_NUM = len(word_index) + 1

    # Rows stay all-zero for words without a GloVe vector.
    embedding_matrix = np.zeros((TOKEN_NUM, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Training inputs and one-hot targets.  They are only consumed by the
    # (currently disabled) model.fit / model.evaluate calls further down.
    X_train = tokenizer.texts_to_sequences(df['News Headline'].values)
    X_train = pad_sequences(X_train, maxlen=MAX_WORD_SEQ)
    Y_train = pd.get_dummies(df['Sentiment']).values

    # The classes are imbalanced, so random over-sampling is applied; it
    # worked better than SMOTE for this data set.
    # fit_resample replaces fit_sample, which newer imbalanced-learn releases
    # no longer provide.
    ros = RandomOverSampler(random_state=777)
    X_ROS, y_ROS = ros.fit_resample(X_train, Y_train)

    # Hold out 20% of the resampled data for testing.
    X_train, x_test, Y_train, y_test = train_test_split(
        X_ROS, y_ROS, test_size=0.2, random_state=42)

    # Frozen embedding layer initialised with the GloVe matrix.
    embedding_layer = Embedding(TOKEN_NUM,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_WORD_SEQ,
                                trainable=False)

    # A GRU trains faster than an LSTM while still coping with long
    # sequences; the training set is small, so it suits this case.
    model = Sequential()
    model.add(embedding_layer)
    model.add(GRU(256, dropout=0.25))
    model.add(Dense(64, activation='relu'))
    # One softmax output per sentiment class.
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # Training / evaluation / saving are disabled; the saved weights are used.
    # history = model.fit(X_train, Y_train,epochs=100, validation_split=0.2, batch_size=100)
    # result = model.evaluate(x_test,y_test)
    # model.save_weights("Sentiment_Financial_News.h5")
    model.load_weights('Sentiment_Financial_News_.h5')

    # Disabled alternative: read the new headlines from the database.
    # con=psycopg2.connect(host = 'localhost', database='final', user='******', password = '******')
    # cur = con.cursor()
    # cur.execute('select * from dataset')
    # rows = cur.fetchall()
    # rows = rows['News Headline'].tolist()
    # cur.close()
    # con.close()

    # Run the new data through the same preprocessing as the training data.
    test = pandas.read_csv('test.csv', encoding='latin-1')
    clean_headlines(test)

    new_data = test["News Headline"]
    seq = tokenizer.texts_to_sequences(new_data)
    padded = pad_sequences(seq, maxlen=MAX_WORD_SEQ)
    pred = model.predict(padded)

    # Index i of this list is the label for class i of the softmax output.
    labels = ["Negative", "Neutral", "Positive"]
    newtest = [labels[np.argmax(x)] for x in pred]

    label = pd.DataFrame(data=newtest, columns=['Sentiment'])
    hasil = pd.concat([test, label], axis=1)

    # Save the predictions next to the other generated files.
    filename_new = 'hasil.csv'
    hasil.to_csv(os.path.join(app.config['SAVED_FOLDER'], filename_new), index=False)

    # filename_new = predict_model(Sentiment=Sentiment)
    # db.session.add(filename_new)
    # db.session.commit()
    dataTable = csv_convert_result_pre(filename_new)

    return render_template('process-done.html',
                           tableTesting=dataTable['Table'],
                           rows=dataTable['Rows'],
                           cols=dataTable['Cols'],
                           filename=filename_new,
                           dnTesting=False)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

base_dir = '/Users/ashishbansal/PycharmProjects/TensorflowProject/Coursera/'
data_dir = base_dir + 'Data/'

# wget --no-check-certificate \
#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
#     -O /Users/ashishbansal/PycharmProjects/TensorflowProject/Coursera/Data/irish-lyrics-eof.txt

with open(data_dir + 'irish-lyrics-eof.txt') as f:
    data = f.read()
# print(data)

# One lower-cased corpus entry per line of the lyrics file.
corpus = data.lower().split('\n')
# print(corpus)

token = Tokenizer()
token.fit_on_texts(corpus)

# Vocabulary size, +1 because index 0 is never assigned to a word.
total_word = len(token.word_index) + 1
# print(total_word)

# sequence=token.texts_to_sequences(corpus)
# print(sequence)

# For every line, collect each prefix of its token sequence that holds at
# least two tokens.
input_data = []
for line in corpus:
    sequence = token.texts_to_sequences([line])[0]
    for i in range(1, len(sequence)):
        se = sequence[:i + 1]
        input_data.append(se)

print(input_data)
# Convolution / recurrent / dense hyper-parameters.
filter_length = 3
nb_filters = 128
n_gram = 3
cnn_dropout = 0.0
nb_rnnoutdim = 300
rnn_dropout = 0.0
nb_labels = 1
dense_wl2reg = 0.0
dense_bl2reg = 0.0

# Clean the training texts and learn the vocabulary from them.
texts = data_train.map(clean_text)
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts)
encoded_train = tokenizer.texts_to_sequences(texts=texts)

# +1 because index 0 is never assigned to a word.
vocab_size_train = len(tokenizer.word_index) + 1
print(vocab_size_train)

x_train = sequence.pad_sequences(encoded_train,
                                 maxlen=time_step,
                                 padding='post')

# Encode the remaining texts with the tokenizer fitted on the training set.
texts = data_rest.map(clean_text)
encoded_test = tokenizer.texts_to_sequences(texts=texts)
mail = data.email mail = mail.astype(str) label = data.label le = LabelEncoder() label = le.fit_transform(label) label = label.reshape(-1, 1) X_train, X_test, Y_train, Y_test = train_test_split(mail, label, test_size=0.2) vocab = set() for e in mail: for w in e.split(): vocab.add(w) max_words = len(vocab) # Vocab max size max_len = 100 # Sentences padded to 100 words vector tok = Tokenizer(num_words=max_words) tok.fit_on_texts(X_train) sequences = tok.texts_to_sequences(X_train) sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len) test_sequences = tok.texts_to_sequences(X_test) test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len) # saving tokenizer with open('tokenizer.pickle', 'wb') as handle: pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL) model = tf.keras.models.Sequential([ tf.keras.layers.Embedding(max_words, 50, input_length=max_len), tf.keras.layers.LSTM(64), tf.keras.layers.Dense(512, activation='relu'),
def run():
    """Train the essay-scoring model described by ``config`` and export it.

    Reads ``config.INPUT_FILE``, fits a tokenizer on the essays (and on the
    prompts when ``config.TRAIN_PROMPT`` is set), trains the model returned by
    ``utils`` with checkpointing and early stopping, and finally saves the
    model under ``../models``.
    """
    df = pd.read_csv(config.INPUT_FILE)

    # Keep only the columns needed for this run.
    wanted_columns = ['essay', config.TRAIN_FOR]
    if config.TRAIN_PROMPT:
        wanted_columns = ['prompt'] + wanted_columns
    df = df[wanted_columns]

    df['essay_cleaned'] = df['essay'].apply(utils.replace_label)

    # Fit the tokenizer: prompts first (when used), then the cleaned essays.
    tokenizer = Tokenizer(num_words=config.VOCAB_SIZE)
    if config.TRAIN_PROMPT:
        tokenizer.fit_on_texts(df['prompt'])
    tokenizer.fit_on_texts(df['essay_cleaned'])

    X = utils.preprocess(df['essay_cleaned'], tokenizer, config.MAX_LEN)
    if config.TRAIN_PROMPT:
        X_prompt = utils.preprocess(df['prompt'], tokenizer,
                                    config.MAX_LEN_PROMPT)
    y = df[config.TRAIN_FOR].values

    # Uncomment if getting "DNN implementation Not Found" Error
    # physical_devices = tf.config.list_physical_devices('GPU')
    # tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

    embeddings = utils.load_embedding_matrix(tokenizer, config.GLOVE_PATH)

    if config.TRAIN_PROMPT:
        model = utils.get_model_prompt()
    else:
        model = utils.get_model(embeddings)
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    # Keep only the weights with the lowest validation MAE.
    mcp_save = ModelCheckpoint(
        filepath=
        f'../models/model-PROMPT_{config.TRAIN_PROMPT}_{config.TRAIN_FOR}_epochs_{config.EPOCHS}_{datetime.now()}.h5',
        save_best_only=True,
        monitor='val_mae',
        mode='min',
        verbose=1)
    # Stop once the validation loss has not improved for 10 epochs.
    earlyStopping = EarlyStopping(monitor='val_loss',
                                  patience=10,
                                  verbose=1,
                                  mode='min')
    training_callbacks = [mcp_save, earlyStopping]

    if config.TRAIN_PROMPT:
        history = model.fit([X_prompt, X],
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.2,
                            verbose=1,
                            callbacks=training_callbacks)
    else:
        history = model.fit(X,
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.3,
                            verbose=1,
                            shuffle=True,
                            callbacks=training_callbacks)
    # print(model.summary())

    # Disabled: pickle the training history and the tokenizer.
    # with open(f'../models/model-TRAIN_PROMPT-{config.TRAIN_PROMPT}-\
    # {config.TRAIN_FOR}-epochs-{config.EPOCHS}-\
    # {datetime.now()}.pickle', 'wb') as handle:
    #     pickle.dump(history.history, handle)
    # with open(f'../models/tokenizer_essays.pickle', 'wb') as handle:
    #     pickle.dump(tokenizer, handle)

    # Saving the model
    MODEL_DIR = (f"../models/prompt-essay/PROMPT_{config.TRAIN_FOR}"
                 if config.TRAIN_PROMPT else f"../models/{config.TRAIN_FOR}")
    version = "1"
    export_path = os.path.join(MODEL_DIR, version)
    print('export_path = {}\n'.format(export_path))

    tf.keras.models.save_model(model,
                               export_path,
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
class model_application:
    """Classify text bodies with a saved Keras model and a pickled tokenizer.

    Typical flow: ``read_data`` -> ``check_tokenizer`` -> ``data_processing``
    -> ``load_prediction``.
    """

    def __init__(self, newData):
        """Store the file locations and create an unfitted tokenizer.

        Args:
            newData: Location of the CSV read by ``read_data('csv')``.
        """
        self.dataform = '.\\Dataset\\dataset_form.csv'
        self.dataset = '.\\Dataset\\prototype_final_shuffle_dataset(50000)_real.csv'
        self.newdata = newData
        self.model_tokenizer = 'tokenizer.word_index_original_new_100000'
        self.model_nlp = '.\\Model\\illegal_nlp_model_new_100000.h5'
        self.model_tokenizer_path = '.\\Model\\' + self.model_tokenizer
        self.max_num_words = 100000
        # Every body is padded / truncated to this many tokens.
        self.max_len = 2720
        # Created up front; fitted later by check_tokenizer when no pickled
        # tokenizer is available.
        self.tokenizer = Tokenizer(num_words=self.max_num_words)

    def split(self, _input_X_data):
        """Return each text of *_input_X_data* as a list of whitespace tokens."""
        return [text.split() for text in _input_X_data]

    def none_inspection(self, _tmp_X_data, _tokenizer):
        """Collect, per document, the words missing from the tokenizer index.

        Args:
            _tmp_X_data: Iterable of word lists (one list per document).
            _tokenizer: Object exposing a ``word_index`` mapping.

        Returns:
            One list per document with every occurrence of a word that is
            not a key of ``_tokenizer.word_index``, in document order.
        """
        known_words = _tokenizer.word_index
        return [[word for word in words if word not in known_words]
                for words in _tmp_X_data]

    def delete_none(self, _tmp_X_data, _none_exist):
        """Remove the listed words from each document and re-join the rest.

        The input lists are left untouched; one occurrence is removed per
        entry of the matching ``_none_exist`` list.

        Args:
            _tmp_X_data: Sequence of word lists (one list per document).
            _none_exist: Sequence, aligned with ``_tmp_X_data``, of the words
                to remove from each document.

        Returns:
            A list of space-joined strings, one per document.
        """
        cleaned = []
        for doc_idx, words in enumerate(_tmp_X_data):
            remaining = list(words)  # work on a copy, not the caller's list
            for item in _none_exist[doc_idx]:
                remaining.remove(item)
            cleaned.append(" ".join(remaining))
        return cleaned

    def read_data(self, form):
        """Read the input CSV selected by *form* with every column as ``str``.

        Args:
            form: ``'csv'`` reads ``self.newdata``; ``'url'`` reads
                ``self.dataform``.

        Returns:
            The loaded DataFrame, or ``None`` for any other *form*.
        """
        if form == 'csv':
            source = self.newdata
        elif form == 'url':
            source = self.dataform
        else:
            return None
        return pd.read_csv(source, encoding='utf-8').astype(str)

    def check_tokenizer(self):
        """Load the pickled tokenizer, or fit and save one when it is missing.

        The pickle is both looked up and written at
        ``self.model_tokenizer_path``, so a tokenizer fitted on one call is
        found again on the next one.

        Returns:
            The loaded or freshly fitted tokenizer.
        """
        if os.path.isfile(self.model_tokenizer_path):
            print(self.model_tokenizer + ' EXISTS')
            with open(self.model_tokenizer_path, 'rb') as f:
                print('LOADING TOKENIZER')
                # NOTE(security): pickle.load executes arbitrary code from
                # the file; only load tokenizer files this application wrote.
                return pickle.load(f)

        print(self.model_tokenizer + " DOESN'T exist")
        data = pd.read_csv(self.dataset, encoding='utf-8').astype(str)
        X_data = data['body']
        # y_data = data['classification']
        print(data.isnull().values.any())
        data.info()
        # Build the word index from every row of the body column.
        self.tokenizer.fit_on_texts(X_data)
        with open(self.model_tokenizer_path, 'wb') as f:
            print('SAVING TOKENIZER')
            pickle.dump(self.tokenizer, f)
        return self.tokenizer

    def data_processing(self, _data, _tokenizer):
        """Turn the ``body`` column of *_data* into padded index sequences.

        Words missing from ``_tokenizer.word_index`` are dropped first, and
        the ``body`` column of *_data* is replaced by the cleaned texts.

        Args:
            _data: DataFrame with a ``body`` column of strings.
            _tokenizer: Fitted tokenizer, e.g. from ``check_tokenizer``.

        Returns:
            The result of ``pad_sequences`` with ``maxlen=self.max_len``.
        """
        tmp_X_data = self.split(_data['body'])
        none_exist = self.none_inspection(tmp_X_data, _tokenizer)
        tmp_X_data = self.delete_none(tmp_X_data, none_exist)
        # Assign the whole column at once instead of writing element by
        # element through a Series taken from the frame.
        _data['body'] = tmp_X_data
        sequences = _tokenizer.texts_to_sequences(tmp_X_data)
        return pad_sequences(sequences, maxlen=self.max_len)

    def load_prediction(self, _final_X_data):
        """Load the saved model and predict classes and percentages.

        Uses ``model.predict`` and derives the classes from its output the
        way the former ``predict_classes`` helper did (argmax for several
        outputs, 0.5 threshold for a single one).

        Args:
            _final_X_data: Padded sequences from ``data_processing``.

        Returns:
            ``(_predictions, _percentage)``: the predicted classes and, per
            sample, the single model output multiplied by 100.
        """
        model = load_model(self.model_nlp)
        _probability = model.predict(_final_X_data, verbose=2)
        if _probability.shape[-1] > 1:
            _predictions = _probability.argmax(axis=-1)
        else:
            _predictions = (_probability > 0.5).astype('int32')
        # .item() requires exactly one output value per sample.
        _percentage = [float(number.item()) * 100 for number in _probability]
        return _predictions, _percentage
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Learn the vocabulary from the sample sentences; unknown words map to <OOV>.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=5)

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)

# Encode sentences that contain words the tokenizer has never seen.
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")
print(padded)
for m in range(len(list2[k])): para = list2[k][m] sentences = tokenize.sent_tokenize(para) for sent in sentences: page_sentences.append(sent) sent_list.append(page_sentences) ''' This data array contains the word index of every word present in the page. and each page is represented as a list of sentences ''' data = np.zeros((len(df),MAX_SENTS, MAX_SENT_LENGTH), dtype='float32') # Fit all the scanned data into the tokenizer tokenizer = Tokenizer(nb_words=MAX_NB_WORDS,oov_token = True) tokenizer.fit_on_texts(texts) word_index = tokenizer.word_index # Generate the word index of each word using the tokenizer for i, sentences in enumerate(sent_list): for m, sent in enumerate(sentences): if m < MAX_SENTS: wordTokens = text_to_word_sequence(sent) k = 0 for i_ , word in enumerate(wordTokens): if(k < MAX_SENT_LENGTH): if word in tokenizer.word_index: if(tokenizer.word_index[word] < MAX_NB_WORDS):
model.load_weights('./exp') data = treebank.tagged_sents() X = [] Y = [] for sents in data: token_sequence = [] tag_sequence = [] for token in sents: token_sequence.append(token[0]) tag_sequence.append(token[1]) X.append(token_sequence) Y.append(tag_sequence) # encode X word_tokenizer = Tokenizer() # instantiate tokeniser word_tokenizer.fit_on_texts(X) # fit tokeniser on data # use the tokeniser to encode input sequence X_encoded = word_tokenizer.texts_to_sequences(X) # encode Y tag_tokenizer = Tokenizer() tag_tokenizer.fit_on_texts(Y) Y_encoded = tag_tokenizer.texts_to_sequences(Y) # sequences greater than 100 in length will be truncated MAX_SEQ_LENGTH = 100 X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post") Y_padded = pad_sequences(Y_encoded,
car_components_df = cardata_ds.to_pandas_dataframe()
components = car_components_df["text"].tolist()
labels = car_components_df["label"].tolist()

print("Processing car components data completed.")

#-------------------------------------------------------------------
#
# Use the Tokenizer from Keras to "learn" a vocabulary from the entire car components text
#
#-------------------------------------------------------------------

print("Tokenizing data...")

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(components)
sequences = tokenizer.texts_to_sequences(components)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# NOTE(review): the padded length is taken from embedding_dim -- confirm that
# reusing this value as the sequence length is intended.
data = pad_sequences(sequences, maxlen=embedding_dim)
labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

print("Tokenizing data complete.")

#-------------------------------------------------------------------
#
# Goal: tell positive reviews from negative ones.
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

docs = [
    '너무 재밌어요',
    '참 최고에요',
    '참 잘 만든 영화예요',
    '추천하고 싶은 영화입니다.',
    '한 번 더 보고 싶네요',
    '글쎄요',
    '별로에요',
    '생각보다 지루해요',
    '연기가 어색해요',
    '재미없어요',
    '너무 재미없다',
    '참 재밌네요',
    '규현이가 잘 생기긴 했어요',
]

# One label per document: positive = 1, negative = 0.
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
# {'참': 1, '너무': 2, '잘': 3, '재밌어요': 4, '최고에요': 5,
# '만든': 6, '영화예요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10,
# '한': 11, '번': 12, '더': 13, '보고': 14, '싶네요': 15, '글쎄요': 16,
# '별로에요': 17, '생각보다': 18, '지루해요': 19, '연기가': 20, '어색해요': 21,
# '재미없어요': 22, '재미없다': 23, '재밌네요': 24, '규현이가': 25, '생기긴': 26, '했어요': 27}

# Turn every sentence into a list of word indices.
x = token.texts_to_sequences(docs)
print(x)
# [[2, 4], [1, 5], [1, 3, 6, 7], [8, 9, 10], [11, 12, 13, 14, 15], [16],
# [17], [18, 19], [20, 21], [22], [2, 23], [1, 24], [25, 3, 26, 27]]

# Problem: the sentences have different lengths.
# Solution: pad the shorter sentences with 0 up to the longest length.
texts_test = [''.join(clean_text(text)) for text in X_test]

# Notebook-style inspection of a few values.
texts
data["Text"][92]
texts_train[92]

# Use the Keras Tokenizer to build an internal word -> integer vocabulary
# and to express every sentence as a sequence of those integers.
# NOTE(review): the tokenizer is fitted on `texts`, not on `texts_train` --
# confirm whether the vocabulary is meant to include the test texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_train = tokenizer.texts_to_sequences(texts_train)
sequence_test = tokenizer.texts_to_sequences(texts_test)

index_of_words = tokenizer.word_index

# Vocabulary size = number of unique words + index 0, reserved for padding.
vocab_size = len(index_of_words) + 1

print('Number of unique words: {}'.format(len(index_of_words)))

sequence_train, sequence_test
def tokenizer_data(list_tweets):
    """Fit a Keras ``Tokenizer`` on *list_tweets* and encode the same texts.

    Args:
        list_tweets: Texts to fit the tokenizer on and to encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the integer sequences for
        *list_tweets* and the fitted tokenizer. Callers needing the
        vocabulary size can use ``len(tokenizer.word_index)``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list_tweets)
    sequences = tokenizer.texts_to_sequences(list_tweets)
    return sequences, tokenizer
OOV_TOK = "<OOV>"  # token used for out-of-vocabulary words
TRAIN_SIZE = 15542

## Data loading and preprocessing
# Drop incomplete rows, then renumber the remaining rows from 0.
Train = pd.read_csv("train.csv")
Train = Train.dropna()
Train = Train.copy()
Train.reset_index(inplace=True)

x = np.array(Train['title'])
y = np.array(Train['label'])

# Reproducible 85/15 split of titles and labels.
train_sentences, test_sentences, train_labels, test_labels = model_selection.train_test_split(
    x, y, test_size=0.15, random_state=101)

train_sentences = np.array(train_sentences)
test_sentences = np.array(test_sentences)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# The vocabulary is learned from the training titles only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(train_sentences)
wordIndex = tokenizer.word_index

# Encode and pad both splits with identical settings.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences,
                             maxlen=MAX_LENGTH,
                             padding=PADDING_TYPE,
                             truncating=TRUNC_TYPE)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences,
                            maxlen=MAX_LENGTH,
                            padding=PADDING_TYPE,
                            truncating=TRUNC_TYPE)
from sklearn import metrics import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.sequence import pad_sequences import re import nltk nltk.download('punkt', quiet=True) from nltk.tokenize import word_tokenize nltk.download('stopwords', quiet=True) from nltk.corpus import stopwords pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') tokenizer = Tokenizer() from tensorflow.keras.layers import Embedding ''' About the task: You are provided with a codeflow- which consists of functions to be implemented(MANDATORY). You need to implement each of the functions mentioned below, you may add your own function parameters if needed(not to main). Execute your code using the provided auto.py script(NO EDITS PERMITTED) as your code will be evaluated using an auto-grader. ''' def embedding(vocab_size, word_index): embeddings_index = {} with open('/content/drive/MyDrive/CS772/glove.6B.100d.txt') as f: for line in f: values = line.split() word = values[0]
"we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ] sentences = [] labels = [] with open(data_file, "r") as csvfile: reader = csv.reader(csvfile, delimiter=",") next(reader) for row in reader: labels.append(row[0]) # sentence = row[1] sentence = " ".join([w for w in row[1].split() if w not in stopwords]) sentences.append(sentence) # print(len(sentences)) # print(sentences[0]) # Tokenize sentences tokenizer = Tokenizer(oov_token="<OOV>") tokenizer.fit_on_texts(sentences) word_index = tokenizer.word_index print(f"Length of word_index: {len(word_index)}") sequences = tokenizer.texts_to_sequences(sentences) padded = pad_sequences(sequences, padding='post') print(f"Padded shape: {padded.shape}")
def run_preprocess(data):
    """Clean, split and encode the dataset for training and evaluation.

    Passes ``data`` through ``Main.do_preprocess``, makes a stratified
    train/test split (fixed ``random_state=42``), fits the sentence and
    label tokenizers on the training split only, and encodes both splits
    with those tokenizers.

    Args:
        data: Input handed to ``Main.do_preprocess``; the result is indexed
            by ``"text"`` and ``"label"`` below.

    Returns:
        ``(train_sentences, train_labels, test_sentences, test_labels)``.
        Sentences are integer sequences post-padded with 0 to
        ``Params.max_sent_size``; labels are reshaped to
        ``[n_samples, n_labels]``.

    Side effects:
        Sets ``Main.sentence_tokenizer`` and ``Main.label_tokenizer`` and
        pickles them into ``Params.model_dir`` as
        ``sentence_tokenizer.pickle`` and ``label_tokenizer.pickle``.
    """
    # Same punctuation filter is used for the sentence and label tokenizers.
    punctuation_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

    def _encode_sentences(texts):
        # Token ids, post-padded with 0 up to the configured sentence size.
        sequences = Main.sentence_tokenizer.texts_to_sequences(texts)  # list
        return pad_sequences(sequences,
                             maxlen=Params.max_sent_size,
                             padding="post",
                             value=0.)

    def _encode_labels(labels):
        # One categorical row per sample, built from the label tokenizer ids.
        n_labels = len(Main.label_tokenizer.word_index)
        label_ids = np.array(Main.label_tokenizer.texts_to_sequences(labels))
        # Tokenizer ids start at 1, so shift by one for to_categorical.
        one_hot = np.array([
            to_categorical(ids - 1, num_classes=n_labels)
            for ids in label_ids
        ])
        # [n_samples, n_labels]
        return one_hot.reshape((one_hot.shape[0], one_hot.shape[-1]))

    def _save_tokenizer(fitted_tokenizer, filename):
        # Persist a fitted tokenizer next to the model artifacts.
        with open(os.path.join(Params.model_dir, filename), "wb") as handle:
            pickle.dump(fitted_tokenizer, handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    # preprocess
    data = Main.do_preprocess(data)

    # split train-test
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=Params.test_size,
        random_state=42,
        stratify=data["label"])

    # max sentence size (presumably updates Params.max_sent_size - confirm
    # against Main.find_max_sentence_size)
    Main.find_max_sentence_size(pd.DataFrame(X_train, columns=["text"]))

    # train data: both tokenizers are fitted on the training split only
    train_df = pd.DataFrame(zip(X_train, y_train), columns=["text", "label"])

    Main.sentence_tokenizer = Tokenizer(
        oov_token="UNK", filters=punctuation_filter,
        lower=True)  # 0 index reserved as padding_value
    Main.sentence_tokenizer.fit_on_texts(train_df["text"])
    train_sentences = _encode_sentences(train_df["text"])
    _save_tokenizer(Main.sentence_tokenizer, "sentence_tokenizer.pickle")

    Main.label_tokenizer = Tokenizer(filters=punctuation_filter, lower=True)
    Main.label_tokenizer.fit_on_texts(train_df["label"])
    train_labels = _encode_labels(train_df["label"])
    _save_tokenizer(Main.label_tokenizer, "label_tokenizer.pickle")

    # test data: encoded with the tokenizers fitted above
    test_df = pd.DataFrame(zip(X_test, y_test), columns=["text", "label"])
    test_sentences = _encode_sentences(test_df["text"])
    test_labels = _encode_labels(test_df["label"])

    # fasttext embedding init (currently disabled)
    #Main.fasttext_embedding_init()

    return train_sentences, train_labels, test_sentences, test_labels
sent = sent.replace('.', '') sent = sent.replace('?', '') sent = sent.replace('/', '') sent = sent.replace(':', '') sent = sent.replace(';', '') return sent # Tokenizer oov_token = "<OOV>" max_length = 20 num_topic_words = 4 tokenizer = Tokenizer(oov_token=oov_token) with open(f"{path_to_respgen}/bin/Tokens.txt", 'r') as file: js_string = file.read() tokenizer = tokenizer_from_json(js_string) word_index = tokenizer.word_index word_index['startsent'] = 0 word_index['endsent'] = len(word_index) + 1 index_word = {word_index[word]: word for word in word_index} vocab_size = len(word_index) + 1 def preprocess_sent(text_list): inputs = [] for sent in text_list: inputs.append(remove_char(sent))
for row in data: train_sentences.append(row[2]) labels.append(row[1]) id.append(row[0]) labels = np.array(labels) #hyperparameter vocab_size = 10000 embedding_size = 64 max_length = 40 trunc_type = "post" oov_tok = "<OOV>" num_epochs = 3 #prepare training data tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok) tokenizer.fit_on_texts(train_sentences) sequences = tokenizer.texts_to_sequences(train_sentences) padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) word_index = tokenizer.word_index #model building model = tf.keras.Sequential([ tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_length), tf.keras.layers.Flatten(), tf.keras.layers.Dense(16, activation='relu'), tf.keras.layers.Dense(1, activation='sigmoid'),
# Text column and the three label columns, taken as NumPy arrays.
teks = df_baru['Teks'].values
label = df_baru[['penipuan', 'promo', 'sms']].values

"""**Training and Validation Spliting** <br>
dengan 20% data test. sudah menggunakan fungsi tokenizer untuk menggabungkan data teks.
"""
# (English) Train/validation split with 20% of the data held out as the test
# set; the text is then converted to padded integer sequences.

from sklearn.model_selection import train_test_split

teks_latih, teks_test, label_latih, label_test = train_test_split(
    teks, label, test_size=0.2)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token='x')
# Fit the vocabulary on the training split only. Fitting on the test split as
# well would leak held-out data into preprocessing and would mean the OOV
# token is never exercised on test text.
tokenizer.fit_on_texts(teks_latih)

sekuens_latih = tokenizer.texts_to_sequences(teks_latih)
sekuens_test = tokenizer.texts_to_sequences(teks_test)

padded_latih = pad_sequences(sekuens_latih)
# Pad/truncate the test split to the training width; without an explicit
# maxlen each call pads to its own longest sequence, so the two matrices could
# end up with different numbers of columns.
padded_test = pad_sequences(sekuens_test, maxlen=padded_latih.shape[1])

"""Terdapat 3 kelas kategorikal, dilihat dari shape."""
# (English) There are 3 categorical classes, as seen from the shape.
print(label.shape)

"""**Pembuatan Model Layer** <br>
menggunakan embedding dan LSTM. relu dibuat agar fully connected antar layer sementara dropout untuk mengurangi overfitting.
"""
# (English) Model construction: embedding and LSTM layers; relu is used for
# the fully connected layers and dropout to reduce overfitting.