# Plotting The Word Cloud For Text That Is Fake
#plt.figure(figsize = (20,20))
#wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 0].clean_joined))
#plt.imshow(wc, interpolation = 'bilinear')
#plt.show()

# Splitting Data Into Test And Train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.clean_joined, df.isfake, test_size = 0.2)

from nltk import word_tokenize

# Creating A Tokenizer To Tokenize The Words And Create Sequences Of Tokenized Words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Adding Padding
padded_train = pad_sequences(train_sequences, maxlen = 40, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences, maxlen = 40, truncating = 'post')

# Sequential Model
model = Sequential()

# Embedding layer
model.add(Embedding(total_words, output_dim = 128))
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

docs = [
    'so funny', 'very nice', 'well mad movie', 'suggest this movie',
    'want to see again',
    'dumdum' 'boring movie',  # adjacent literals concatenate into one string, keeping 12 docs to match the 12 labels
    'acting bad', 'not fun', 'boring', 'too boring',
    'very funny', 'he is handsome'
]
# Positive 1, Negative 0
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)

x = token.texts_to_sequences(docs)
print(x)

from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_x = pad_sequences(x, padding='pre', maxlen=5)  # post
print(pad_x)
print(np.unique(pad_x))
print(len(np.unique(pad_x)))

# 2. Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, Conv1D
num_words = 5000
# Maximum length of a news item
max_news_len = 50
# Number of news classes
nb_classes = 5

# Load the training data
train = pd.read_csv('data/train_en.csv', header=None, names=['text', 'star'])
reviews = train['text']
y_train = utils.to_categorical(train['star'] - 1, nb_classes)

for i in reviews:
    if type(i) is float:
        print(i)

# Create the tokenizer
tokenizer = Tokenizer(num_words=num_words)
# Fit the tokenizer
tokenizer.fit_on_texts(reviews)
# Apply it to our data
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_news_len)

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Build the neural network
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 100, input_length=max_news_len))
model_lstm.add(SpatialDropout1D(0.1))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.1))
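# --- Minimal sketch (not from the source above): loading the pickled tokenizer back
# for inference, assuming the same 'tokenizer.pickle' file and max_news_len as above.
# 'new_texts' and 'loaded_tokenizer' are hypothetical names introduced for illustration.
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open('tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

new_texts = ["an example review to score"]
new_sequences = loaded_tokenizer.texts_to_sequences(new_texts)
new_padded = pad_sequences(new_sequences, maxlen=max_news_len)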
    line = line.replace("(", "")
    line = line.replace(")", "")
    line = line.replace("/", "")
    line = line.replace("\\", "")
    line = line.replace("&", "")
    line = line.replace("#", "")
    line = re.sub(r'\d', '', line)
    line = line.split(' ')
    line = [w for w in line if not w in stop_words]
    line = str(line)
    line = str(line.strip())[1:-1].replace(' ', ' ')
    strings.append(line)

# Encode text as numbers
tok_Len = 100000  # max number of words for the tokenizer
tokenizer = Tokenizer(num_words=tok_Len)
tokenizer.fit_on_texts(strings)
sequences = tokenizer.texts_to_sequences(strings)
term_Index = tokenizer.word_index
print('Number of Terms:', len(term_Index))

sen_Len = 98  # max length of each sentence, including padding
tok_Features = pad_sequences(sequences, padding='post', maxlen=sen_Len)
print('Shape of tokenized features tensor:', tok_Features.shape)

indices = np.arange(tok_Features.shape[0])
np.random.shuffle(indices)

time_series = df['created_at_retweets']
time_series.reset_index(drop=True, inplace=True)
time_series = time_series[indices]
print('\nDoing label encoding')
# Encode the string labels as integers
label_encoder = LabelEncoder()
label_encoder.fit(training_labels)
training_labels = label_encoder.transform(training_labels)
print('\nDone label encoding')

vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

print('\nDoing tokenization')
# Tokenize the user sentences and turn them into padded sequences.
# Pass vocab_size and oov_token so the indices stay within the Embedding layer's range.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)
print('\nDone with tokenization')

# Creating the neural network model
print("\nBuilding the network")
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu', name='Hidden_Layer_1'))
model.add(Dense(16, activation='relu', name='Hidden_Layer_2'))
model.add(Dense(num_classes, activation='softmax', name='Output_Layer'))
    def __init__(self, num_words=None):
        self.num_words = num_words
        if num_words is not None:
            self.tokenizer = Tokenizer(num_words=self.num_words)
        else:
            self.tokenizer = Tokenizer()
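    # --- Minimal sketch (an assumption, not from the source class): the kind of
    # fit/transform helpers a wrapper like this usually exposes around the Keras
    # Tokenizer. The method names 'fit' and 'to_padded' are hypothetical.
    def fit(self, texts):
        # Build the vocabulary from an iterable of raw strings
        self.tokenizer.fit_on_texts(texts)

    def to_padded(self, texts, maxlen):
        # Convert raw strings to integer sequences and pad them to a fixed length
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=maxlen, padding='post')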
train_sentences = sentences[:train_size]
train_labels = labels[:train_size]
validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# In[9]:

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

print(len(train_sequences[0]))
print(len(train_padded[0]))
print(len(train_sequences[1]))
print(len(train_padded[1]))
print(len(train_sequences[10]))
url = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json'
#path_to_json = tf.keras.utils.get_file('sarcasm.json', origin=url)
#PATH = os.path.join(os.path.dirname(path_to_json), 'sarcasm')
sarcasm = wget.download(url)

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'Hey! how are you doing?',
    'I am doing great! What about you',
    'Well! I had some stuff to do but now I am free, thanks for asking'
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word = tokenizer.word_index
sentence_sequences = tokenizer.texts_to_sequences(sentences)

test_sequence = [
    'Hey, everyone how are you doing ?',
    'Everyone: We are good, lets just party',
]
test_data_sequences = tokenizer.texts_to_sequences(test_sequence)

pad_seq = pad_sequences(sentence_sequences)
pad_test_seq = pad_sequences(test_data_sequences)

print(word)
print(sentence_sequences)
print(f'Test sequences: \n {test_data_sequences}')
print(f'Padded sequences for sentences:\n {pad_seq}')
print(f'Padded sequences for test data:\n {pad_test_seq}')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'i love my dog',
    'i love my cat',
    'you love dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words=100, oov_token="<RAJ>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Create sequences
sequences = tokenizer.texts_to_sequences(sentences)
#print(word_index)
#print(sequences)

test_data = ['i really love my dog', 'my dog loves my manatee']
test_seq = tokenizer.texts_to_sequences(test_data)
#print(test_seq)

# Padding: 'post' or 'pre', plus the truncating option and maxlen
padded = pad_sequences(sequences, padding='post', maxlen=8)
print(word_index)
print(sentences)
    tk = loaded_tokenizer.texts_to_sequences([t1])
    # Left-pad each sequence with zeros up to length 22
    for i in range(len(tk)):
        for j in range(22 - len(tk[i])):
            tk[i].insert(0, 0)
    tk = np.array(tk)
    return tk


def predict(loaded_model, tk, le_loaded):
    pred = loaded_model.predict(tk)
    return le_loaded.inverse_transform([np.argmax(pred[0])])


is_first_time = False
model = tf.keras.Model()
tokenizer = Tokenizer(num_words=5000, split=" ")
le = preprocessing.LabelEncoder()


@app.route("/api", methods=["POST", "GET"])
def login():
    global is_first_time
    global model
    global tokenizer
    global le
    tweet = request.get_json()["tweet"]
    if not is_first_time:
        is_first_time = True
        model = load_model()
        tokenizer = load_tokenizer()
        le = load_label_encoder()
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sent = [
    'This is sentence one',
    'This is sentence two',
    'That is sentence! three',
    'Do you believe this sentence is amazing?'
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sent)
word_index = tokenizer.word_index
initial_text_to_sequence = tokenizer.texts_to_sequences(sent)

print(word_index)
print(initial_text_to_sequence)
plt.figure(figsize=(15, 5))
plt.plot(dates, temp)
plt.title('Topic', fontsize=20)

# In[63]:

from sklearn.model_selection import train_test_split
dates_train, dates_test, label_train, label_test = train_test_split(dates, temp, test_size=0.2)

# In[64]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(dates_train)
tokenizer.fit_on_texts(dates_test)

sekuens_latih = tokenizer.texts_to_sequences(dates_train)
sekuens_test = tokenizer.texts_to_sequences(dates_test)

padded_latih = pad_sequences(sekuens_latih)
padded_test = pad_sequences(sekuens_test)

# In[65]:

def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
''' Remove reviews that are null '''
train_data = train_data.dropna(how='any')

''' Remove everything except Korean characters and spaces '''
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎ ㅏ-ㅣ가-힣 ]", "")

''' Set the stopwords (the choice of stopwords is subjective) '''
stop_words = ['다', '의', '가', '이', '은', '들', '는', '.', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

''' Load the prepared training data '''
with open('x_train.json', encoding="utf-8") as f:
    x_train = json.load(f)

''' Integer-encode the tokenized text '''
tokenizer = Tokenizer(num_words=35000)  # create a Tokenizer that keeps only the 35,000 most frequent words
tokenizer.fit_on_texts(x_train)  # build the word index
x_train = tokenizer.texts_to_sequences(x_train)  # convert the texts to lists of integer indices

''' Convert the lists into a 2D integer tensor of shape (len(x_train), maxlen) '''
x_train = pad_sequences(x_train, maxlen=30)

''' Store the sample labels in y_train '''
y_train = np.array(train_data['label'])

''' Hide TensorFlow info and warning messages '''
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

''' Load the model trained with the LSTM algorithm '''
model = models.load_model('model2.h5')
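# --- Minimal sketch (an assumption, not from the source): scoring one new review with
# the tokenizer and the loaded model above. It assumes the input has already been
# tokenized the same way as x_train; 'new_review' and the 0.5 sigmoid threshold are
# illustrative conventions, not taken from the source.
new_review = [['재미', '있다']]  # hypothetical, pre-tokenized input
new_seq = tokenizer.texts_to_sequences(new_review)
new_pad = pad_sequences(new_seq, maxlen=30)
score = float(model.predict(new_pad))
print('positive' if score > 0.5 else 'negative')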
okt = Okt()
for sentence in train_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)  # tokenize into morphemes
    temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
    X_train.append(temp_X)

X_test = []
for sentence in test_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)  # tokenize into morphemes
    temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
    X_test.append(temp_X)

## Integer encoding ##
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# print(tokenizer.word_index)

## Check the share of rare words in the data ##
threshold = 3
total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = 0    # count of words that appear fewer than `threshold` times
total_freq = 0  # total frequency of all words in the training data
rare_freq = 0   # total frequency of words that appear fewer than `threshold` times

# Iterate over the (word, frequency) pairs as key and value
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value
    # If the word's frequency is below the threshold
    word = values[0]
    vectors = np.asarray(values[1:])
    sysevr_emb_dict[word] = vectors
sysevr_embeddings.close()

print('asts embedding')
print(ast_emb_dict['34'])
print('cg embedding')
print(cg_emb_dict['VAR1'])
print('sysevr embedding')
print(sysevr_emb_dict['const'])

# Tokenize corpus
ast_tokenizer = Tokenizer()
cg_tokenizer = Tokenizer()
bcg_tokenizer = Tokenizer()
fcg_tokenizer = Tokenizer()
sysevr_tokenizer = Tokenizer()

print("tokenizing asts")
# Fit tokenizers
ast_tokenizer.fit_on_texts(ast_data)
print("tokenizing cgs")
bcg_tokenizer.fit_on_texts(back_slices_data)
fcg_tokenizer.fit_on_texts(forward_slices_data)
from tensorflow.keras.preprocessing.text import Tokenizer
# pad_sequences lets us work with sentences of different lengths, using padding or
# truncation to make all of the sentences the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

# Create a Tokenizer instance. 100 is quite large here, since we only have a handful of unique words.
# num_words = 100 means it will keep the 100 most common words
# OOV = out-of-vocabulary token
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# The tokenizer provides a word_index property, a dictionary of word -> index pairs
word_index = tokenizer.word_index

# Turn the sentences above into sequences of tokens
sequences = tokenizer.texts_to_sequences(sentences)

# Padding
padded = pad_sequences(sequences, maxlen=5)
# padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)
# padding='post' -> zeros are added at the end rather than the front
# maxlen=5 caps the length; without it, padding goes up to the length of the longest sentence
def solution_model():
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
    urllib.request.urlretrieve(url, 'sarcasm.json')

    # DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_size = 20000

    sentences = []
    labels = []
    # YOUR CODE HERE
    with open('sarcasm.json', 'r') as f:
        dataset = json.load(f)
    for item in dataset:
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])

    token = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    token.fit_on_texts(sentences)
    sentences = token.texts_to_sequences(sentences)
    sentences = pad_sequences(sentences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    # print(token.word_index)

    x_train = np.array(sentences[0:training_size])
    x_test = np.array(sentences[training_size:])
    y_train = np.array(labels[0:training_size])
    y_test = np.array(labels[training_size:])

    from tensorflow.keras.layers import Conv1D, Flatten, Dense, BatchNormalization
    model = tf.keras.Sequential([
        # YOUR CODE HERE. KEEP THIS OUTPUT LAYER INTACT OR TESTS MAY FAIL
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Conv1D(128, 3),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Conv1D(64, 5),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        # tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.MaxPool1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.summary()

    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    es = EarlyStopping(patience=8)
    lr = ReduceLROnPlateau(factor=0.25, patience=4, verbose=1)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(x_train, y_train, epochs=1000, validation_split=0.2, callbacks=[es, lr])
    print(model.evaluate(x_test, y_test))
    return model
def extract_features(pos_tagged_sentences,
                     feature_detector=ner_features,
                     included_features=['rnn_proba', 'word', 'pos', 'cluster'],
                     included_words=[-2, -1, 0, 1, 2]):
    """
    Transform a list of tagged sentences into a scikit-learn compatible POS dataset
    :param pos_tagged_sentences:
    :param feature_detector:
    :return:
    """
    tokenizer = utils.get_tokenizer()

    sentences = []
    for pos_tagged_sentence in pos_tagged_sentences:
        sentence, pos = zip(*pos_tagged_sentence)
        sentences.append(sentence)
    sentences = [" ".join(words) for words in sentences]

    # GET RNN PROBA
    X_rnn = tokenizer.texts_to_sequences(sentences)
    X_rnn = sequence.pad_sequences(X_rnn, maxlen=81, padding='post', value=Const.PADDING)

    tags = [
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
    ]
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.utils import to_categorical
    from polyglot.text import Text

    pos_tokenizer = Tokenizer()
    pos_tokenizer.fit_on_texts(tags)

    def read_pos_from_sentences(sentences):
        pos = []
        for sent in sentences:
            plg = Text(sent)
            plg.language = 'id'
            _, plg = zip(*plg.pos_tags)
            pos.append(" ".join(list(plg)))
        pos = pos_tokenizer.texts_to_sequences(pos)
        return pos

    pos_rnn = read_pos_from_sentences(sentences)
    pos_rnn = sequence.pad_sequences(pos_rnn, maxlen=81, padding='post', value=Const.PADDING)
    pos_rnn = to_categorical(pos_rnn)

    # GET CLUSTERS
    list_of_clusters = None
    with open(Const.CLUSTER_ROOT + 'cluster_list_1000.pkl', 'rb') as fi:
        list_of_clusters = dill.load(fi)
    from we.cluster.KMeans import transform
    clusters = transform(X_rnn, list_of_clusters)

    X = []
    K.clear_session()
    ote = RNNOpinionTargetExtractor()
    ote.load_best_model()
    proba = ote.predict([X_rnn, pos_rnn], batch_size=1)

    for i in range(len(pos_tagged_sentences)):
        X_sent = sent2features(pos_tagged_sentences[i], proba[i], clusters[i],
                               feature_detector,
                               included_features=included_features,
                               included_words=included_words)
        X.append(X_sent)
    return X
class NeuralNet1:
    tokenizer = Tokenizer()

    def __init__(self, filename, fetch_data=False):
        self.model = self.init_neuralnet()
        if fetch_data:
            self.X_train, self.Y_train, self.X_test, self.Y_test = self.get_data()
        if filename is not None:
            self.test_inputdata(filename)
        elif filename is None:
            with open(filepath + '/tokenizer.pickle', 'rb') as handle:
                self.tokenizer = pickle.load(handle)
            self.model.load_weights(filepath + '/best_model.h5')
            print("NeuralNet2 Test ready complete")

    def get_data(self):
        train_data = pd.read_table('ratings_train.txt')
        test_data = pd.read_table('ratings_test.txt')
        train_data = NeuralNet1.preprocessing(train_data)
        test_data = NeuralNet1.preprocessing(test_data)
        X_train = NeuralNet1.Token(train_data)
        X_test = NeuralNet1.Token(test_data)
        self.tokenizer.fit_on_texts(X_train)
        with open(filepath + '/tokenizer.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Build the label arrays
        Y_train = np.array(train_data['label'])
        Y_test = np.array(test_data['label'])
        X_train, Y_train = NeuralNet1.rmEmpty(X_train, Y_train)

        # Padding
        X_train = pad_sequences(X_train, maxlen=100, padding='post')
        X_test = pad_sequences(X_test, maxlen=100, padding='post')
        return X_train, Y_train, X_test, Y_test

    def test_inputdata(self, filename):
        # Assumes one sentence per line; the data follows the format below
        """
        id  document        label
        19238   영화가 재밌네요   1
        1234    재미없어요      0...
        -> if the data is to be predicted, leave the label column blank.
        """
        test_data = pd.read_table(filename)
        with open(filepath + '/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.model.load_weights(filepath + '/best_model.h5')
        print("Load tokenizer and model complete, will predict data")
        # print(test_data)
        for i in range(len(test_data)):
            result = self.sentiment_predict(test_data['document'][i])
            test_data.loc[i, ['label']] = result
        test_data.to_csv("result.txt", sep=' ', float_format='%.0f')

    def init_neuralnet(self):
        # Hyperparameters are adjusted here
        embedding_dim = 256
        hidden_dim = 512
        dropout_rate = 0.6

        # Model
        input = Input(shape=(100, ))
        x = Embedding(2542, embedding_dim)(input)
        x = Dropout(dropout_rate)(x)
        x = Conv1D(hidden_dim, 5, padding="same")(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
        x_res = x
        x = Bidirectional(LSTM(int(hidden_dim / 2), return_sequences=True))(x)
        x = x + x_res
        x = LeakyReLU()(x)
        x = LSTM(hidden_dim, return_sequences=True)(x)
        x = GlobalMaxPool1D()(x)
        x = Dropout(dropout_rate)(x)
        output = Dense(1, activation='sigmoid')(x)

        model = Model(inputs=[input], outputs=output)
        optimizer = Adam(learning_rate=0.001)
        loss_function = BinaryCrossentropy()
        model.compile(optimizer=optimizer, loss=loss_function, metrics=['acc'])
        model.summary()
        return model

    @staticmethod
    def preprocessing(data):
        data = data.drop_duplicates(subset=['document'])
        data = data.dropna(how='any')
        data['document'] = data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
        data['document'] = data['document'].replace('', np.nan)
        data = data.dropna(how='any')
        return data

    @staticmethod
    def Token(data):
        now = 0
        res = list()
        print("start token")
        for sentence in data['document']:
            if now % 10000 == 0:
                print(f"token : {now}/{len(data)}")
            now = now + 1
            temp = list()
            for i in range(len(sentence)):
                temp.append(sentence[i])
            res.append(temp)
        print("end token")
        return res

    @staticmethod
    def Tokenizing(data, tokenizer):
        return tokenizer.texts_to_sequences(data)

    @staticmethod
    def rmEmpty(data, label):
        drop_data = [index for index, sentence in enumerate(data) if len(sentence) < 1]
        data = np.delete(data, drop_data, axis=0)
        label = np.delete(label, drop_data, axis=0)
        return data, label

    def sentiment_predict(self, sentence, justone=False):
        word_list = []
        for letter in sentence:
            word_list.append(letter)
        word_list.insert(0, "<s>")
        word_list.insert(len(word_list), "<e>")
        encoded = self.tokenizer.texts_to_sequences([word_list])
        padding_sentence = pad_sequences(encoded, maxlen=100)
        score = float(self.model.predict(padding_sentence))
        if not justone:
            if score > 0.5:
                return 1
            else:
                return 0
        elif justone:
            return score
print('Size of the vocabulary:', total_cnt)
print('Number of rare words appearing %s times or fewer: %s' % (threshold - 1, rare_cnt))
print("Proportion of rare words in the vocabulary:", (rare_cnt / total_cnt) * 100)
print("Proportion of rare-word occurrences in the total word frequency:", (rare_freq / total_freq) * 100)

vocab_size = total_cnt - rare_cnt + 1
print('Size of the vocabulary:', vocab_size)

y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])
'''
max_words = 35000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print(X_train[:3])
print(X_test[:3])
print(train_data['label'][:40])

#################################################
#################################################
# train_use = pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
# train_use['x_train'] = X_train
# train_use['y_train'] = train_data['label']
#
# train_use.to_csv("./data/train_d.csv", mode='w', index = False, header = False)

# Next, one-hot encode the labels -1, 0, 1 that will be used as y values, so the model can read them
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a Keras Tokenizer
en_tok = Tokenizer()

# Fit the tokenizer on some text
en_tok.fit_on_texts(en_text)

for w in ["january", "apples", "summer"]:
    # Get the word ID of word w
    id = en_tok.word_index[w]
    # Print the word and the word ID
    print(w, " has id: ", id)
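# --- Minimal sketch (not from the source above): the reverse lookup. index_word is the
# inverse dictionary that the tf.keras Tokenizer maintains alongside word_index, and
# sequences_to_texts reverses texts_to_sequences. The ID values used are illustrative.
print(en_tok.index_word[5])
print(en_tok.sequences_to_texts([[5, 12, 7]]))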
def read_data(train_file, VAC_DIR, MAX_SEQUENCE_LENGTH):
    # MAX_SEQUENCE_LENGTH = 1200  # truncation length per text or sentence
    MAX_NUM_WORDS = 20000   # vocabulary size used to build the word vectors
    EMBEDDING_DIM = 100     # word-vector dimensionality
    VALIDATION_SPLIT = 0.3

    # Build the word-vector index
    print("Indexing word vectors.")
    embeddings_index = {}
    with open(VAC_DIR, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]                                 # the word
            coefs = np.asarray(values[1:], dtype='float32')  # its vector
            embeddings_index[word] = coefs                   # word -> vector
    print('Found %s word vectors.' % len(embeddings_index))

    print('Preprocessing the text dataset')
    texts = []   # list of training text samples
    labels = []  # list of labels
    # Read the training data
    data = pd.read_csv(train_file)
    texts = data['Item'].tolist()
    labels = data['Tag'].replace('non-LN', 0).replace('LN', 1).tolist()
    print("Found %s texts %s label_id." % (len(texts), len(labels)))

    # Vectorize the text samples
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    # fit_on_texts(texts) builds the token dictionary from a list of documents
    # (each element is one document), deduplicating the words
    tokenizer.fit_on_texts(texts)
    # texts_to_sequences(texts) converts the documents into vectors of word indices,
    # shape [len(texts), len(text)] -- (number of documents, length of each document)
    sequences = tokenizer.texts_to_sequences(texts)
    print(sequences[0])
    print(len(sequences))

    # word_index is a dict mapping every word to its id, starting from 1
    word_index = tokenizer.word_index
    print("Found %s unique tokens." % len(word_index))
    # ['the', 'to', 'of', 'a', 'and', 'in', 'i', 'is', 'that', "'ax"] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    print(list(word_index.keys())[0:10], list(word_index.values())[0:10])

    # A very handy function -- sequences longer than MAX_SEQUENCE_LENGTH are truncated,
    # shorter ones are zero-padded
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print("Training data shape:", data.shape)
    print("Label shape:", labels.shape)

    # Split the training data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)  # shuffle the data
    data = data[indices]
    labels = labels[indices]
    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    # Training data
    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]
    # Validation data
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    # Prepare the embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)      # vocabulary size
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # 20000 x 100
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:  # skip words ranked beyond the 20,000 most frequent
            continue
        embedding_vector = embeddings_index.get(word)  # look up the word's vector
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return x_train, y_train, x_val, y_val, num_words, embedding_matrix
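# --- Minimal sketch (an assumption, not from the source): how an embedding matrix like
# the one returned above is typically plugged into a Keras model via Embedding(weights=...).
# The file names, sequence length, and the LSTM head are illustrative, not the source's setup.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

x_train, y_train, x_val, y_val, num_words, embedding_matrix = read_data(
    'train.csv', 'glove.6B.100d.txt', 1000)  # hypothetical arguments

model = Sequential([
    # Initialize the layer with the pretrained vectors and keep them frozen
    Embedding(num_words, 100, weights=[embedding_matrix],
              input_length=1000, trainable=False),
    LSTM(64),
    Dense(2, activation='softmax')
])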
    def train(self, bucket_name, key):
        dataset = pd.read_csv(self.dataset_location)

        X = []
        sentences = list(dataset['text'])
        for sen in sentences:
            X.append(pre.Preprocess(sen).preprocess_text())
        y = dataset['label']

        encoder = LabelBinarizer()
        y = encoder.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

        tokenizer = Tokenizer(num_words=10000)
        tokenizer.fit_on_texts(X_train)
        X_train = tokenizer.texts_to_sequences(X_train)
        X_test = tokenizer.texts_to_sequences(X_test)

        maxlen = 100
        X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
        X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

        embedding_dict = dict()
        with open(self.embedding_location, encoding='UTF-8') as glove_file:
            for line in glove_file:
                records = line.split()
                word = records[0]
                vector_dimension = np.asarray(records[1:], dtype='float32')
                embedding_dict[word] = vector_dimension

        vocab_size = len(tokenizer.word_index) + 1
        embedding_matrix = np.zeros((vocab_size, 100))
        for word, index in tokenizer.word_index.items():
            embedding_vector = embedding_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

        model = Sequential([
            Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False),
            Bidirectional(LSTM(50, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
            Bidirectional(LSTM(54, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)),
            Bidirectional(LSTM(60, dropout=0.3, recurrent_dropout=0.3)),
            Dense(64, activation="relu"),
            Dense(7, activation="softmax")
        ])
        model.compile(optimizer='adam', loss='categorical_crossentropy',
                      metrics=['accuracy', Precision(), Recall()])
        history = model.fit(X_train, y_train, batch_size=128, epochs=100, verbose=1, validation_split=0.2)

        with tempfile.TemporaryFile() as fp:
            dump(model, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(Body=fp.read(), Bucket='team08-public',
                                                       Key='model/model_final.model')

        with tempfile.TemporaryFile() as fp:
            dump(encoder.classes_, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(Body=fp.read(), Bucket='team08-public',
                                                       Key='model/class_names.npy')

        with tempfile.TemporaryFile() as fp:
            dump(tokenizer, fp)
            fp.seek(0)
            self.s3.Bucket('team07-public').put_object(Body=fp.read(), Bucket='team08-public',
                                                       Key='model/tokenizer.tokenizer')
    temp_X = []
    temp_X = kor.morphs(sentence)  # tokenize into morphemes
    # Remove the particles registered as stopwords
    temp_X = [word for word in temp_X if not word in stopwords]
    train_data_document.append(temp_X)

X_input = []
for sentence in df['document']:
    temp_X = []
    temp_X = kor.morphs(sentence)  # tokenize into morphemes
    # Remove the particles registered as stopwords
    temp_X = [word for word in temp_X if not word in stopwords]
    X_input.append(temp_X)

# Integer encoding: the text becomes integers such as 1, 2, 3, 4
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_document)

# Remove words that repeat fewer times than the integer threshold
threshold = 2
total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = 0    # count of words that appear fewer than `threshold` times
total_freq = 0  # total frequency of all words in the training data
rare_freq = 0   # total frequency of words that appear fewer than `threshold` times

# Iterate over the (word, frequency) pairs as key and value
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value
    # If the word's frequency is below the threshold
    if (value < threshold):
oov_tok = '<OOV>'
training_portion = 0.8

# Preprocessing
articles = df['text_without_stopwords']
authors = df['author']

train_size = int(len(articles) * training_portion)

train_articles = articles[0:train_size]
train_authors = authors[0:train_size]
validation_articles = articles[train_size:]
validation_authors = authors[train_size:]

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index
vocab_size = len(word_index)

train_sequences = tokenizer.texts_to_sequences(train_articles)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

author_tokenizer = Tokenizer()
author_tokenizer.fit_on_texts(authors)

training_author_seq = np.array(author_tokenizer.texts_to_sequences(train_authors))
validation_author_seq = np.array(author_tokenizer.texts_to_sequences(validation_authors))
    # Step 4 - Sort the ranks and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
        # print(" ".join(ranked_sentence[i][1]))

    return summarize_text
    # Step 5 - Of course, output the summarized text
    # print("Summarize Text: \n", ". ".join(summarize_text))


x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr))

thresh = 4
cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0

for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if (value < thresh):
        cnt = cnt + 1
        freq = freq + value
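# --- Minimal sketch (an assumption, not from the snippet above): how counts like tot_cnt
# and cnt are typically used next -- capping the vocabulary at the non-rare words and
# re-fitting a Tokenizer with that limit before converting x_tr to sequences. The names
# x_voc and x_tr_seq are hypothetical; the same pattern appears elsewhere in this collection.
x_voc = tot_cnt - cnt + 1  # +1 for the padding index
x_tokenizer = Tokenizer(num_words=x_voc)
x_tokenizer.fit_on_texts(list(x_tr))
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr)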
    def train_new_model(self, texts, context_labels=None, num_epochs=50,
                        gen_epochs=1, batch_size=128, dropout=0.0,
                        train_size=1.0, validation=True, save_epochs=0,
                        multi_gpu=False, **kwargs):
        self.config = self.default_config.copy()
        self.config.update(**kwargs)
        print("Training new model w/ {}-layer, {}-cell {}LSTMs".format(
            self.config['rnn_layers'], self.config['rnn_size'],
            'Bidirectional ' if self.config['rnn_bidirectional'] else ''))

        # Create text vocabulary for new texts
        # if word-level, lowercase; if char-level, uppercase
        self.tokenizer = Tokenizer(filters='',
                                   lower=self.config['word_level'],
                                   char_level=(not self.config['word_level']))
        self.tokenizer.fit_on_texts(texts)

        # Limit vocab to max_words
        max_words = self.config['max_words']
        self.tokenizer.word_index = {
            k: v
            for (k, v) in self.tokenizer.word_index.items() if v <= max_words
        }

        if not self.config.get('single_text', False):
            self.tokenizer.word_index[self.META_TOKEN] = len(self.tokenizer.word_index) + 1
        self.vocab = self.tokenizer.word_index
        self.num_classes = len(self.vocab) + 1
        self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

        # Create a new, blank model w/ given params
        self.model = textgenrnn_model(self.num_classes, dropout=dropout, cfg=self.config)

        # Save the files needed to recreate the model
        with open('{}_vocab.json'.format(self.config['name']), 'w', encoding='utf8') as outfile:
            json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False)

        with open('{}_config.json'.format(self.config['name']), 'w', encoding='utf8') as outfile:
            json.dump(self.config, outfile, ensure_ascii=False)

        self.train_on_texts(texts,
                            new_model=True,
                            via_new_model=True,
                            context_labels=context_labels,
                            num_epochs=num_epochs,
                            gen_epochs=gen_epochs,
                            train_size=train_size,
                            batch_size=batch_size,
                            dropout=dropout,
                            validation=validation,
                            save_epochs=save_epochs,
                            multi_gpu=multi_gpu,
                            **kwargs)
config.logger.info(
    "Preprocessed data:\n"
    f"  {original_X[0]} → {X[0]}")

# Split data
X_train, X_val, X_test, y_train, y_val, y_test = data.train_val_test_split(
    X=X, y=y, val_size=args.val_size, test_size=args.test_size, shuffle=args.shuffle)
config.logger.info(
    "Data splits:\n"
    f"\tX_train: {len(X_train)}, y_train: {len(y_train)}\n"
    f"\tX_val: {len(X_val)}, y_val: {len(y_val)}\n"
    f"\tX_test: {len(X_test)}, y_test: {len(y_test)}")

# Tokenizer
X_tokenizer = Tokenizer(
    filters=args.filters, lower=args.lower,
    char_level=args.char_level, oov_token='<UNK>')
X_tokenizer.fit_on_texts(X_train)
vocab_size = len(X_tokenizer.word_index) + 1  # +1 for padding token
config.logger.info(f"vocab_size: {vocab_size}")

# Convert texts to sequences of indices
original_text = X_train[0]
X_train = np.array(X_tokenizer.texts_to_sequences(X_train))
X_val = np.array(X_tokenizer.texts_to_sequences(X_val))
X_test = np.array(X_tokenizer.texts_to_sequences(X_test))
preprocessed_text = X_tokenizer.sequences_to_texts([X_train[0]])[0]
config.logger.info(
    "Text to indices:\n"
    f"  (raw) {original_text}\n"
    f"  (preprocessed) {preprocessed_text}\n")
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my car',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

# Keep the 100 most common words; '<oov>' stands in for out-of-vocabulary words
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')
tokenizer.fit_on_texts(sentences)  # lowercases the text and strips punctuation such as '!'
word_index = tokenizer.word_index  # builds a dictionary of word -> index
print(word_index)

sequence = tokenizer.texts_to_sequences(sentences)
print(sequence)

# Make all sentences the same size by putting zeros where there is no word
padded = pad_sequences(sequence, padding='post', maxlen=5, truncating='post')
print(padded)

test_data = ['I really love my dog', 'My dog loves my manatee']
test_seq = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_seq, maxlen=10)
print(test_padded)
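# --- Minimal sketch (not from the source above): mapping the test sequences back to words
# to see which tokens fell outside the training vocabulary. sequences_to_texts is part of
# the same tf.keras Tokenizer API used above.
decoded = tokenizer.sequences_to_texts(test_seq)
print(decoded)  # unseen words such as 'manatee' come back as '<oov>'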