import pickle

from keras.preprocessing import sequence


def vectorizer(text):
    # Load the fitted tokenizer and turn a single raw string into a padded
    # sequence of word indices (MAX_SEQUENCE_LENGTH is defined elsewhere).
    with open('./tokenizer.pkl', 'rb') as handle:
        tokenizer = pickle.load(handle)
    text = [text]
    text = tokenizer.texts_to_sequences(text)
    text = sequence.pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH)
    return text
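# Hedged usage sketch (assumed, not from the original code): vectorize a raw
# review string and pass it to an already-trained Keras model; `model` is
# assumed to be loaded elsewhere.
padded = vectorizer('the movie was surprisingly good')
prediction = model.predict(padded)
print('Prediction:', prediction)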
def convert_text(string_text, to_rgb=False):
    # Accept a single string or a list of strings.
    if not isinstance(string_text, list):
        string_text = [string_text]
    # Load the fitted tokenizer and convert the text into padded index sequences.
    with open('save_tokenizer/tokenizer3.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    string_seq = tokenizer.texts_to_sequences(string_text)
    string_pad = sequence.pad_sequences(string_seq, maxlen=686)
    text_converted = string_pad  # string_seq
    print('Text_converted: ', text_converted)
    return text_converted
import os

from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Suppress warning and informational messages
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

NUM_WORDS = 6000       # keep only the top n most frequent words
SKIP_TOP = 0           # skip the most frequent words (the, and, a, ...)
MAX_REVIEW_LEN = 400   # max number of words kept from a review

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=NUM_WORDS,
                                                      skip_top=SKIP_TOP)

# print a sample
# print("encoded word sequence:", x_train[3])

x_train = sequence.pad_sequences(x_train, maxlen=MAX_REVIEW_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_REVIEW_LEN)
print("x_train.shape:", x_train.shape, "x_test.shape:", x_test.shape)

model = Sequential()
model.add(Embedding(NUM_WORDS, 64))
model.add(LSTM(128))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

BATCH_SIZE = 24
EPOCHS = 5
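# Hedged training/evaluation sketch (assumed, not part of the original
# snippet): fit on the padded IMDB sequences with the constants above and
# report test accuracy.
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(x_test, y_test))
loss, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print("Test loss:", loss, "Test accuracy:", acc)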
y_train = le.transform(data_train.category)
y_test = le.transform(data_test.category)
Y_train = np_utils.to_categorical(y_train)
Y_test = np_utils.to_categorical(y_test)

## Tokenize text
logging.info("Tokenizing text...")
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data_train.text)
x_train = tokenizer.texts_to_sequences(data_train.text)
x_test = tokenizer.texts_to_sequences(data_test.text)

## Pad sequences
logging.info("Transforming tokens into sequences...")
max_input_size = len(max(x_train, key=len))
X_train = sequence.pad_sequences(x_train, maxlen=max_input_size)
X_test = sequence.pad_sequences(x_test, maxlen=max_input_size)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

# One copy of the padded input per convolutional channel (one per kernel size)
X_train_multi = []
X_test_multi = []
for i in range(len(kernel_size)):
    X_train_multi.append(X_train)
    X_test_multi.append(X_test)

## Build model
# 1. Embeddings layer
inputs = Input(shape=(max_input_size,))
x = Embedding(vocab_size, embedding_dims)(inputs)

# 2. Convolutional channels for n-grams
channels = []
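# Hedged sketch of the remaining model (assumed continuation, not from the
# original code): one Conv1D + GlobalMaxPooling1D channel per kernel size,
# concatenated and fed to a softmax classifier. `num_filters`, the import
# list, and the single-input layout are assumptions; the X_train_multi lists
# above suggest the original may instead have used one Input per channel.
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, concatenate
from keras.models import Model

num_filters = 128  # assumed filter count per channel
for size in kernel_size:
    channel = Conv1D(filters=num_filters, kernel_size=size, activation='relu')(x)
    channel = GlobalMaxPooling1D()(channel)
    channels.append(channel)

# 3. Merge channels and classify
merged = concatenate(channels)
outputs = Dense(Y_train.shape[1], activation='softmax')(merged)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])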
vectorizer = CountVectorizer()
sentence_enISEAR = vectorizer.fit_transform(sentence_enISEAR)

# Preprocessing dataset
# To lower case, remove , and .
data = data.str.lower().str.replace(".", "").str.replace(",", "")

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(data)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

data_enISEAR = tokenizer.texts_to_sequences(data)
data_padded = sequence.pad_sequences(data_enISEAR,
                                     maxlen=MAX_SEQUENCE_LENGTH,
                                     padding='post')

embedding_path = '../../../../../Downloads/glove.6B.300d.txt'
# embedding_matrix = util.embedding.prepareEmbeddings(word_index, MAX_NUM_WORDS, embedding_path)
# util.embedding.saveEmbedding(embedding_matrix, EMBEDDING_FILE)
print('Loading embedding ' + EMBEDDING_FILE)
embedding_matrix = util.embedding.loadEmbedding(EMBEDDING_FILE)

class_weight = {
    0: 1.131,
    1: 1.000,
    2: 1.903,
    3: 5.107,
    4: 2.019,
    5: 3.338,
print('    Folds :', KFOLDS)
print('    Runs  :', ROUNDS)
print('-------------------------------\n')

# Tokenize and create word index
print('INFO: Loading Dataset')
instances = instances.str.lower().str.replace('.', '').str.replace(',', '')

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(instances)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

instances_sequences = tokenizer.texts_to_sequences(instances)
instances_padded = sequence.pad_sequences(instances_sequences,
                                          maxlen=MAX_SEQUENCE_LENGTH,
                                          padding='post')

if (args.testset):
    instances_test = instances_test.str.lower().str.replace('.', '').str.replace(',', '')
    instances_sequences_test = tokenizer.texts_to_sequences(instances_test)
    instances_padded_test = sequence.pad_sequences(instances_sequences_test,
                                                   maxlen=MAX_SEQUENCE_LENGTH,
                                                   padding='post')

# Prepare embedding if not present yet
EMBEDDING_DIMS = 300
if (args.createembedding):
    if (not args.createembedding.endswith('.npy')):
from keras.datasets import imdb
from keras.preprocessing import sequence  # sequence utilities for padding/truncating data
from keras import models, layers

"""---------- Data preparation ----------"""
max_features = 10000
maxlen = 500
batch_size = 32

print('Loading data...')
# Load the IMDB data: 25,000 train and 25,000 test reviews, each already
# encoded as a variable-length list of word indices.
(input_train, y_train), (input_test, y_test) = imdb.load_data(
    num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

print('Pad sequences (samples x time)')
# Pad/truncate each review to maxlen words: pad_sequences acts on the second
# dimension (the per-review word sequence), not on the number of reviews.
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

"""---------- Model definition ----------"""
# The Embedding layer is a learned lookup table that maps word indices to
# dense vectors while preserving relationships between words; max_features
# limits it to the 10,000 most frequent words, and 32 is the dimensionality
# of the embedding vectors (unrelated to batch_size).
model = models.Sequential()
model.add(layers.Embedding(max_features, 32))
model.add(layers.SimpleRNN(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()  # print the model architecture
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
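# Hedged training sketch (assumed, not part of the original snippet): fit the
# SimpleRNN model on the padded IMDB data with a validation split; the epoch
# count is an assumption.
history = model.fit(input_train, y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_split=0.2)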
print('    Folds :', KFOLDS)
print('    Runs  :', ROUNDS)
print('---------------------------------------------------------- \n')

# Preprocessing dataset
# To lower case, remove , and .
text_instances = text_instances.str.lower().str.replace(".", "").str.replace(",", "")

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(text_instances)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

text_instances = tokenizer.texts_to_sequences(text_instances)
text_instances_padded = sequence.pad_sequences(
    text_instances, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

if (args.testset):
    text_instances_test = text_instances_test.str.lower().str.replace(".", "").str.replace(",", "")
    text_instances_test = tokenizer.texts_to_sequences(text_instances_test)
    text_instances_padded_test = sequence.pad_sequences(
        text_instances_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

if (args.annotate):
    text_instances_annotate = text_instances_annotate.str.lower().str.replace(".", "").str.replace(",", "")
    text_instances_annotate = tokenizer.texts_to_sequences(text_instances_annotate)
    text_instances_padded_annotate = sequence.pad_sequences(
        text_instances_annotate, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Prepare embedding