def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenize text.

    Vectorize a text corpus by transforming each text into a sequence of
    integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence; pad if shorter,
            truncate if longer.
        x_train: List containing the text data.

    Returns:
        x_train: Tokenized input data.
        word_index: Dictionary mapping each word to its tokenized index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer

    print("tokenizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data read and converted to %d-length sequences" % max_seq_length)
    return x_train, word_index
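# A minimal usage sketch of the helper above. The sample texts and the
# vocabulary/sequence sizes are made-up illustration values, not part of the
# original project.
sample_texts = ["the cat sat on the mat", "the dog barked"]
padded, word_index = tokenlize_text(max_num_words=1000,
                                    max_seq_length=10,
                                    x_train=sample_texts)
print(padded.shape)   # (2, 10): two sequences padded/truncated to length 10
print(word_index)     # e.g. {'the': 1, 'cat': 2, ...} (indices start at 1)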
pool_size = 4

# RNN
rnn_output_size = 70

# Training
batch_size = 256
epochs = 5

print('Loading data...')
(x_train, y_train), (x_val, y_val), (x_test, y_test) = sentiment_140_neg.load_data()

print('Fitting tokenizer...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((x_train, x_val, x_test)))

print('Convert text to sequences')
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)
print(len(x_train), 'train sequences')
print(len(x_val), 'validation sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

data = open('archive/irish-lyrics-eof.txt').read()
corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# Expand each line into its n-gram prefixes to use as training sequences.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# The last token of each padded sequence is the label; the rest is the input.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = keras.utils.to_categorical(labels, num_classes=total_words)
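# To make the prefix-expansion step above concrete, here is a hedged sketch
# on a made-up two-line corpus; the lyric text and resulting indices are
# illustrative only and not taken from the original dataset.
toy_corpus = ["down by the salley gardens", "my love and i did meet"]
toy_tokenizer = Tokenizer()
toy_tokenizer.fit_on_texts(toy_corpus)
toy_line = toy_tokenizer.texts_to_sequences([toy_corpus[0]])[0]
prefixes = [toy_line[:i + 1] for i in range(1, len(toy_line))]
# For a 5-token line [w1, w2, w3, w4, w5] this yields:
#   [w1, w2], [w1, w2, w3], [w1, w2, w3, w4], [w1, w2, w3, w4, w5]
print(prefixes)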
from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import numpy as np

model = load_model('sentiment_model.h5')

test_data = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"]

# Note: fitting a fresh Tokenizer on the test sentence alone produces word
# indices that will not match the tokenizer used at training time; in practice
# the training tokenizer should be reloaded instead.
max_features = 200
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)

max_len = 28
X = pad_sequences(X, maxlen=max_len)

class_names = ['positive', 'negative']
preds = model.predict(X)
print(preds)
# Sequential.predict_classes was removed in recent Keras releases; on newer
# versions use np.argmax(preds, axis=-1) (softmax output) or
# (preds > 0.5).astype(int) (single sigmoid output) instead.
classes = model.predict_classes(X)
print(classes)
print(class_names[int(classes[0])])
                             encoding='utf-8')  # tail of a pd.read_csv call (presumably test_movie_df), truncated before this excerpt
train_movie_df = pd.read_csv('movie_reviews/train.tsv', delimiter='\t',
                             encoding='utf-8')

train_movie_df = train_movie_df.drop(columns=['PhraseId', 'SentenceId'])
test_movie_df = test_movie_df.drop(columns=['PhraseId', 'SentenceId'])

# Lowercase and strip everything except alphanumerics and whitespace.
train_movie_df['Phrase'] = train_movie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
test_movie_df['Phrase'] = test_movie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_movie_df['Phrase'].values)
X_train = tokenizer.texts_to_sequences(train_movie_df['Phrase'].values)
X_train = pad_sequences(X_train)

# Note: fitting again on the test phrases rebuilds the word index, so the
# train sequences above no longer match it; fitting once on train (or on
# train + test before converting) avoids this.
tokenizer.fit_on_texts(test_movie_df['Phrase'].values)
X_test = tokenizer.texts_to_sequences(test_movie_df['Phrase'].values)
X_test = pad_sequences(X_test)  # the original padded X_train here, a copy-paste bug
print("handling data")

embed_dim = 128
lstm_out = 196

def create_model():
    sequential_model = Sequential()
    sequential_model.add(
#train_df = pd.concat([pos_df, neg_df])
train_df = pd.concat([pos_df, neg_df, neu_df])
train_df = train_df.reset_index(drop=True)

x = train_df['Comment'].values
y = train_df['Sentiment Rating'].values
print(train_df.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(train_df['Comment'])
X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)

# Inspect the sequence-length distribution to pick a sensible maxlen.
seq_lens = [len(s) for s in X_train]
print(max(seq_lens))
#pypl.hist(seq_lens, bins=50)
pypl.hist([l for l in seq_lens if l < 200], bins=50)
pypl.show()

X_train = keras.preprocessing.sequence.pad_sequences(X_train,
                                                     padding='post',
                                                     maxlen=150)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,
                                                    padding='post',
                                                    maxlen=150)
def build_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
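# A small hedged usage sketch for the helper above; the import and the sample
# lines are illustration-only assumptions, not part of the original code.
from keras_preprocessing.text import Tokenizer

lines = ["the quick brown fox", "jumped over the lazy dog"]
tokenizer = build_tokenizer(lines)
print(tokenizer.word_index)                 # word -> integer index, starting at 1
print(tokenizer.texts_to_sequences(lines))  # e.g. [[1, 2, 3, 4], [5, 6, 1, 7, 8]]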
df["text"] = df["text"].apply(clean_text) #corpus=[] #for index in range(len(df["headlines"])): # corpus.append(preprocessing.text_preprocessing(df["headlines"][index])) df.drop_duplicates(subset=["text"], inplace=True) df.dropna(inplace=True) print(df.head) voc_size=5000 sent_length=100 X_train, X_test, y_train, y_test = train_test_split(df.text, df.target, test_size=0.3, random_state=37) tk = Tokenizer(num_words=10000, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n', lower=True, split=" ") tk.fit_on_texts(X_train) X_train_seq = tk.texts_to_sequences(X_train) X_test_seq = tk.texts_to_sequences(X_test) X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=100) X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=100) model = Sequential() # initilaizing the Sequential nature for CNN model print(len(tk.index_word)) model.add(Embedding(len(tk.index_word), 32, input_length=100)) model.add(LSTM(100)) model.add(Dense(1, activation='sigmoid')) model.compile( loss='binary_crossentropy',optimizer='adam', metrics=['accuracy']) print(X_train_seq_trunc)
test_label = np.load(BASE_DIR + "/test_label.npy").tolist()  # np.load takes a path; a text-mode file object would fail

train_label_encoder = preprocessing.LabelEncoder()
train_label_encoder.fit(train_label)
joblib.dump(train_label_encoder, DATA_DIR + '/label_encoder.pkl')
train_label = train_label_encoder.transform(train_label)
test_label = train_label_encoder.transform(test_label)
label_dict = dict(
    zip(list(train_label_encoder.classes_),
        train_label_encoder.transform(list(train_label_encoder.classes_))))
print('[INFO] Label dict:', label_dict)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)
word_index = tokenizer.word_index
print('[INFO] Found %s unique word tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_label = to_categorical(np.asarray(train_label))
print('[INFO] Shape of data tensor:', data.shape)
print('[INFO] Shape of label tensor:', train_label.shape)

# Shuffle the data and the labels with the same permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]                 # rearrange train data to shuffled indices
train_label = train_label[indices]   # rearrange train labels to shuffled indices
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
#plt.ylim([0, 10])
plt.xlabel('Epoch')
plt.ylabel('Error [TPSA]')
plt.legend()
plt.grid(True)
plt.savefig(filename)

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

df = pd.read_csv("sample_training_tpsa_caw.csv")
df = df.sample(frac=1)
texts = df.iloc[:, 0].to_list()

# Character-level tokenizer: every character becomes its own token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)
print(tk.word_index)
print("word index len: ", len(tk.word_index))

sequences = tk.texts_to_sequences(texts)
#print(texts[0])
#print(sequences[0])

lens = [len(x) for x in sequences]
#print(lens)
print("max: ", max(lens))
sum_ser = reduce(lambda x, y: x + y, lens)
print("sum ", sum_ser)
avg_len = (sum_ser * 1.0) / len(lens)
print("avg_len: ", avg_len)
with open("dynamic_feature_train.csv.pkl", "rb") as f: labels = pickle.load(f) #train 类别 files = pickle.load(f) #files 文件api vectorizer = TfidfVectorizer( ngram_range=(1, 5), min_df=3, max_df=0.9, ) # tf-idf特征抽取ngram_range=(1,5),如果词的df超过某一阈值则被词表过滤 train_features = vectorizer.fit_transform(files) #将文本中的词语转换为词频矩阵 ,先拟合数据再标准化 tfidftransformer_path = 'tfidf_transformer.pkl' with open(tfidftransformer_path, 'wb') as fw: pickle.dump(vectorizer, fw) #deep learning with open("dynamic_feature_test.csv.pkl", "rb") as f: test_labels = pickle.load(f) outfiles = pickle.load(f) tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', split=' ', char_level=False, oov_token=None) #Tokenizer是一个用于向量化文本,或将文本转换为序列 tokenizer.fit_on_texts(files) tokenizer.fit_on_texts(outfiles) pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
enc = LabelEncoder()
enc.fit(training_labels)
training_labels = enc.transform(training_labels)
##print(training_labels)

vocab_size = 10000
embedding_dim = 16
max_len = 20
trunc_type = 'post'
#padding = 'pre'
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_len)

classes = len(labels)

'''
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(classes, activation='softmax'))
    x_train['consumer_review'], x_train['polarity_label'], test_size=0.3)

# converting to lists
x_train = x_train.values.tolist()
x_test = x_test.values.tolist()
y_train = y_train.values.tolist()
y_test = y_test.values.tolist()

# tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
total_size = len(word_index) + 1
print(total_size)

# texts to sequences
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# add padding to ensure the same length
max_length = 100
batch_size = 128

if __name__ == '__main__':
    df = pd.read_csv("data/fake-news-pair-classification-challenge/train.csv",
                     nrows=40000)
    train = df.loc[:, ['title1_zh', 'title2_zh', 'label']]

    # 1. Tokenize, remove stopwords and punctuation
    train['title1_tokenized'] = train['title1_zh'].apply(cut_word)
    train['title2_tokenized'] = train['title2_zh'].apply(cut_word)

    # 2. Build the vocabulary and pad the sequences
    x = pd.concat([train['title1_tokenized'], train['title2_tokenized']])
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(x)
    encoded1 = tokenizer.texts_to_sequences(train['title1_tokenized'])
    encoded2 = tokenizer.texts_to_sequences(train['title2_tokenized'])
    input_len = 25
    pad1 = pad_sequences(encoded1, maxlen=input_len)
    pad2 = pad_sequences(encoded2, maxlen=input_len)

    label = {'unrelated': 0, 'agreed': 1, 'disagreed': 2}
    y = train['label'].apply(lambda x: label[x])

    x1_train_all, x1_test, x2_train_all, x2_test, y_train_all, y_test = train_test_split(
        pad1, pad2, y)
    x1_train, x1_val, x2_train, x2_val, y_train, y_val = train_test_split(
        x1_train_all, x2_train_all, y_train_all)

    x1_train = tf.convert_to_tensor(x1_train, dtype=tf.float32)
    x2_train = tf.convert_to_tensor(x2_train, dtype=tf.float32)