def lstm_raw():
    """Train and evaluate an LSTM classifier on raw word-index sequences.

    Reads hierarchical text data from disk, maps tokens to integer ids,
    pads sequences to a fixed length, trains an Embedding+LSTM softmax
    classifier, and finally saves a model plot, the architecture JSON and
    the weights, all date-stamped.

    Relies on module-level config: N, max_features, maxlen, batch_size,
    num_epoch, Embedding_Dim (not visible in this chunk -- confirm they
    are defined before this function is called).
    """
    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-1000'
    # dicc is the token dictionary returned by the reader; unused here.
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, N)
    nb_classes = np.max(y_t) + 1  # assumes labels are contiguous ints 0..max
    X_t = ReadData.to_num(X_t, max_features)
    X_train, X_test, y_train, y_test = train_test_split(X_t, y_t,
                                                        test_size=0.2,
                                                        random_state=42)

    print('Pading sequences ')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # padding = 'post'
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)    # truncating = 'post'
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Building model...')
    model = Sequential()
    model.add(Embedding(max_features, Embedding_Dim, dropout=0.2))
    model.add(LSTM(Embedding_Dim, dropout_W=0.2, dropout_U=0.2))
    # Project to one unit per class so the softmax output matches the
    # categorical targets built above (to_categorical -> nb_classes columns).
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',  # binary_crossentropy
                  optimizer='adam',
                  metrics=['accuracy'])

    # Single import is enough; the original imported `plot` twice.
    from keras.utils.visualize_util import plot
    plot(model, to_file=r'.\data\lstm-model.png')

    print('Training...')
    # model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
    #           validation_data=(X_test, y_test))
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
              validation_split=0.1, verbose=1)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\lstm-model' + data_today + '.png')

    # Equivalent to json_string = model.get_config()
    json_string = model.to_json()
    # Raw strings avoid accidental escape handling in Windows-style paths;
    # `with` guarantees the JSON file handle is closed even on error
    # (the original leaked it via open(...).write(...)).
    with open(r'.\data\lstm-model' + data_today + '.json', 'w+') as f:
        f.write(json_string)
    model.save_weights(r'.\data\keras-lstm' + data_today + '.h5', overwrite=True)
    print('model saved')
print('Loading data...')
folder_path = r'H:\corpus_trained_model\nltk_data\corpora\movie_reviews'
# NOTE(review): this chunk starts mid-definition and is cut off after the
# Sequential() construction -- the enclosing `def` line and the rest of the
# model-building code are not visible here.
''' 共读 400 个文件
max_features = 1000
maxlen = 1000 # cut texts after this number of words (among top max_features most common words)
batch_size = 32
num_epoch = 10
data set movie review
Test score: 0.753259503841
Test accuracy: 0.5375
'''
#folder_path = r'H:\network_diagnosis_data\new_cut'
# Read up to 200 documents; dicc is the token dictionary (unused below).
# max_features / maxlen are presumably module-level constants -- confirm.
X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, 200)
X_t = ReadData.to_num(X_t, max_features)
# Hold out 20% for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t,
                                                    test_size=0.2,
                                                    random_state=42)
print('Pading sequences ')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  #padding = 'post'
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)  #truncating = 'post'
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('Building model...')
Embedding_Dim = 50  # dimensionality of the word-embedding layer
model = Sequential()
# NOTE(review): the statements before the __main__ guard appear to be the
# tail of a function whose `def` line is not in this chunk; X_t / y_t come
# from that missing context. Bare `print` statements mean this file targets
# Python 2.
print type(X_t), type(y_t)
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t,
                                                    test_size=0.2,
                                                    random_state=42)
print(len(X_train), 'train samples')
print(len(X_test), 'test samples')
print 'Fitting'
# L2-regularized logistic regression with explicitly spelled-out
# hyper-parameters.
classifier = LogisticRegression(C=1.0, class_weight=None, dual=False,
                                fit_intercept=True, intercept_scaling=1,
                                penalty='l2', random_state=None, tol=0.001)
classifier.fit(X_train, y_train)
print classifier.score(X_test, y_test)  # mean accuracy on the held-out split


if __name__ == '__main__':
    # The second assignment wins: the movie-review vector path is dead code
    # kept around as a switchable alternative.
    model_path = r'.\data\movie_review-50D-word-vector'
    model_path = r'.\data\cut1000-all-50D-w2v'
    file_path = r'H:\network_diagnosis_data\cut-500'
    # Read up to 5000 documents, replace each document by the average of its
    # word vectors, evaluate via test_log, then pickle the result.
    x, y, d = ReadData.ReadRaw2HierData(file_path, 5000)
    new_x = average_word_vec(model_path, x)
    test_log(new_x, y)
    new_data = [new_x, y]
    print new_x[0]
    # NOTE(review): the open() handle is never closed explicitly; consider a
    # `with` block.
    pickle.dump(new_data, open(r'.\data\w2v_replaced-500samples.pkl', 'wb'))
def mlp():
    """Train and evaluate a bag-of-words MLP text classifier.

    Reads documents from disk, shuffles them, binarizes each index sequence
    into a fixed-size multi-hot vector, trains a 512-unit ReLU MLP with a
    softmax output layer, then saves the model plot, architecture JSON and
    weights, all date-stamped, and prints the held-out score.
    """
    max_words = 10000  # 1000
    max_feature = 300
    batch_size = 32
    nb_epoch = 5
    vec_dim = 50  # NOTE(review): unused in this function

    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-500'
    # dicc is the token dictionary returned by the reader; unused here.
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, 3000)
    # BUG FIX: the shuffled labels were previously assigned to Y_t while the
    # split below kept using the unshuffled y_t, misaligning samples and
    # labels. Rebind y_t so data and labels stay paired.
    X_t, y_t = ReadData.shuffle_X_Y(X_t, y_t)
    X_t = ReadData.to_num(X_t, max_feature)
    X_train, X_test, y_train, y_test = train_test_split(X_t, y_t,
                                                        test_size=0.2,
                                                        random_state=42)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    nb_classes = np.max(y_train) + 1  # assumes contiguous int labels 0..max
    print(nb_classes, 'classes')

    print('Vectorizing sequence data...')
    # See http://keras-cn.readthedocs.io/en/latest/preprocessing/text/
    # Tokenizer vectorizes texts / converts them to sequences of dictionary
    # indices (ranks starting from 1); here it is only used to turn the
    # index sequences into fixed-width binary (multi-hot) matrices.
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print(
        'Convert class vector to binary class matrix (for use with categorical_crossentropy)'
    )
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    print('Building model...')
    model = Sequential()
    # Fully connected layer: input (, max_words) -> output (, 512)
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(nb_classes))  # output layer: one unit per class
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch,
                        batch_size=batch_size, verbose=1,
                        validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1)

    print('Saving model...')
    from keras.utils.visualize_util import plot
    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\mlp-model' + data_today + '.png')
    # Equivalent to json_string = model.get_config()
    json_string = model.to_json()
    # Raw strings avoid accidental escape handling in Windows-style paths;
    # `with` guarantees the JSON file handle is closed even on error
    # (the original leaked it via open(...).write(...)).
    with open(r'.\data\mlp-model' + data_today + '.json', 'w+') as f:
        f.write(json_string)
    model.save_weights(r'.\data\keras-mlp' + data_today + '.h5', overwrite=True)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    # print(model.predict_classes(X_test, batch_size=batch_size))
    print('---------------------------------')