def lstm_raw():
    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-1000'
    
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, N)
    nb_classes = np.max(y_t) + 1
    X_t = ReadData.to_num(X_t, max_features)
    X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)
    
    print('Padding sequences...')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # padding='post'
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)  # truncating='post'
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    
    print('Building model...')
    
    model = Sequential()
    model.add(Embedding(max_features, Embedding_Dim, dropout=0.2))  # map word indices to dense vectors
    model.add(LSTM(Embedding_Dim, dropout_W=0.2, dropout_U=0.2))  # dropout_W: input dropout, dropout_U: recurrent dropout
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',  # or binary_crossentropy for binary targets
                  optimizer='adam',
                  metrics=['accuracy'])
    
    print('Training...')
#     model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
#               validation_data=(X_test, y_test))
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
              validation_split=0.1, verbose=1)
    
    score, acc = model.evaluate(X_test, y_test,
                                batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    
    from keras.utils.visualize_util import plot
    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\lstm-model' + data_today + '.png')
    json_string = model.to_json()  # equivalent to json_string = model.get_config()
    open(r'.\data\lstm-model' + data_today + '.json', 'w+').write(json_string)
    model.save_weights(r'.\data\keras-lstm' + data_today + '.h5', overwrite=True)
    print('model saved')
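
# A minimal reload sketch (an assumption, not part of the original pipeline):
# restore the architecture and weights saved by lstm_raw() using the Keras 1.x
# model_from_json / load_weights API. Paths are illustrative.
def load_saved_lstm(json_path, weights_path):
    from keras.models import model_from_json
    model = model_from_json(open(json_path).read())  # rebuild the architecture
    model.load_weights(weights_path)                 # restore trained weights
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])              # recompile before evaluate/predict
    return model
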
print('Loading data...')
folder_path = r'H:\corpus_trained_model\nltk_data\corpora\movie_reviews'
'''
Read 400 files in total
max_features = 1000
maxlen = 1000  # cut texts after this number of words (among top max_features most common words)
batch_size = 32
num_epoch = 10
dataset: movie_reviews
Test score: 0.753259503841
Test accuracy: 0.5375
'''

#folder_path = r'H:\network_diagnosis_data\new_cut'
X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, 200)
X_t = ReadData.to_num(X_t, max_features)
X_train, X_test, y_train, y_test = train_test_split(X_t,
                                                    y_t,
                                                    test_size=0.2,
                                                    random_state=42)

print('Padding sequences...')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # padding='post'
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)  # truncating='post'
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Building model...')
Embedding_Dim = 50
model = Sequential()


def test_log(X_t, y_t):
    # Logistic-regression baseline: sanity-check the averaged word vectors.
    print(type(X_t), type(y_t))

    X_train, X_test, y_train, y_test = train_test_split(X_t,
                                                        y_t,
                                                        test_size=0.2,
                                                        random_state=42)
    print(len(X_train), 'train samples')
    print(len(X_test), 'test samples')
    print('Fitting')
    classifier = LogisticRegression(C=1.0,
                                    class_weight=None,
                                    dual=False,
                                    fit_intercept=True,
                                    intercept_scaling=1,
                                    penalty='l2',
                                    random_state=None,
                                    tol=0.001)
    classifier.fit(X_train, y_train)
    print(classifier.score(X_test, y_test))
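

# A hypothetical sketch of what average_word_vec (called in __main__ below)
# might do, assuming the model at model_path is a gensim Word2Vec model:
# represent each document by the mean of its in-vocabulary word vectors.
# The name and details here are illustrative, not the original helper.
def _average_word_vec_sketch(model_path, docs, vec_dim=50):
    from gensim.models import Word2Vec
    w2v = Word2Vec.load(model_path)
    out = []
    for doc in docs:
        vecs = [w2v[w] for w in doc if w in w2v]  # skip out-of-vocabulary words
        out.append(np.mean(vecs, axis=0) if vecs else np.zeros(vec_dim))
    return np.array(out)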


if __name__ == '__main__':
    # model_path = r'.\data\movie_review-50D-word-vector'
    model_path = r'.\data\cut1000-all-50D-w2v'
    file_path = r'H:\network_diagnosis_data\cut-500'
    x, y, d = ReadData.ReadRaw2HierData(file_path, 5000)
    new_x = average_word_vec(model_path, x)
    test_log(new_x, y)
    new_data = [new_x, y]
    print(new_x[0])
    pickle.dump(new_data, open(r'.\data\w2v_replaced-500samples.pkl', 'wb'))
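    # Optional sanity check (a minimal sketch): reload the pickle to confirm
    # the dump round-trips.
    X_loaded, y_loaded = pickle.load(open(r'.\data\w2v_replaced-500samples.pkl', 'rb'))
    print(len(X_loaded), 'feature vectors reloaded')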


def mlp():
    max_words = 10000  #1000
    max_feature = 300
    batch_size = 32
    nb_epoch = 5
    vec_dim = 50
    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-500'
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, 3000)
    X_t, y_t = ReadData.shuffle_X_Y(X_t, y_t)  # shuffle features and labels together
    X_t = ReadData.to_num(X_t, max_feature)
    X_train, X_test, y_train, y_test = train_test_split(X_t,
                                                        y_t,
                                                        test_size=0.2,
                                                        random_state=42)

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'classes')

    print('Vectorizing sequence data...')
    # See http://keras-cn.readthedocs.io/en/latest/preprocessing/text/
    # Tokenizer is a class for vectorizing text, i.e. turning each text into a
    # sequence of word indices into the dictionary (counting from 1).
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
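    # mode='binary' yields an (n_samples, max_words) indicator matrix: e.g. the
    # sequence [1, 3] becomes a row with 1.0 in columns 1 and 3, 0.0 elsewhere.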
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    print('Building model...')
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, )))  # fully connected: input (, max_words) -> output (, 512)
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(nb_classes))  # output layer: one unit per class
    model.add(Activation('softmax'))
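    model.summary()  # optional: print layer shapes and parameter counts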

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train,
                        Y_train,
                        nb_epoch=nb_epoch,
                        batch_size=batch_size,
                        verbose=1,
                        validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1)

    print('Saving model...')
    from keras.utils.visualize_util import plot
    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\mlp-model' + data_today + '.png')
    json_string = model.to_json()  # equivalent to json_string = model.get_config()
    open(r'.\data\mlp-model' + data_today + '.json', 'w+').write(json_string)
    model.save_weights(r'.\data\keras-mlp' + data_today + '.h5', overwrite=True)

    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    #     print (model.predict_classes(X_test,batch_size=batch_size))
    print('---------------------------------')
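
    # A minimal follow-up sketch: inspect a few held-out predictions.
    # predict_classes (Keras 1.x Sequential helper) returns the argmax of the
    # softmax output for each sample.
    y_pred = model.predict_classes(X_test, batch_size=batch_size, verbose=0)
    print('first 10 predicted classes:', y_pred[:10])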