def main(unused_argv):
    """Entry point: build embeddings, preprocess the raw data, then run the
    QA model in the mode selected by ``config.mode``:

    - ``train``: train from the latest checkpoint (if any).
    - ``show_examples``: restore the best checkpoint and report dev metrics.
    - ``official_eval``: restore the best checkpoint, dump predictions as
      JSON, and print the official score.

    Raises:
        Exception: if ``config.mode`` is none of the modes above.
    """
    # Resolve the embedding file (defaults to GloVe 6B at the configured
    # dimensionality) and load the matrix plus word<->id vocab mappings.
    config.embedding_file = config.embedding_file or 'glove.6B.{}d.txt'.format(
        config.embedding_size)
    embed_matrix, word2id, id2word = get_embedding_word2id_id2word(
        config.embedding_dir, config.embedding_file, config.embedding_size)

    # Preprocess the raw data and persist the processed artifacts.
    prepro(config)

    # Build the QA model.
    qa_model = Model(config, embed_matrix, word2id, id2word)

    # Session options: allow TF to fall back to an available device when the
    # requested one does not exist, and grow GPU memory on demand.
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    log.info("### 『{}』 model is working in『{}』 mode,batch_size:『{}』###".format(
        config.experiment_name, config.mode, config.batch_size))

    if config.mode == 'train':
        with tf.Session(config=sess_config) as sess:
            initial_model(sess, config.ckpt_path)
            qa_model.train(sess)
    elif config.mode == 'show_examples':
        with tf.Session(config=sess_config) as sess:
            # 1. Restore the best saved checkpoint (must exist).
            initial_model(sess, config.best_model_ckpt_path, expect_exists=True)
            # 2. Run validation and report the metrics.
            dev_loss, dev_f1, dev_em = qa_model.validate(sess)
            print(dev_loss, dev_f1, dev_em)
    elif config.mode == 'official_eval':
        with tf.Session(config=sess_config) as sess:
            # 1. Restore the best saved checkpoint (must exist).
            initial_model(sess, config.best_model_ckpt_path, expect_exists=True)
            # 2. Predict and persist answers keyed by question uuid.
            uuid2ans = qa_model.test(sess)
            with codecs.open(config.predict_answer_file, 'w',
                             encoding='utf-8') as f:
                # BUG FIX: the original wrapped this in `unicode(...)`, which
                # does not exist on Python 3 and raised NameError.
                # json.dumps(..., ensure_ascii=False) already returns a text
                # string that the codecs writer accepts on both Py2 and Py3,
                # so the wrapper is simply dropped.
                f.write(json.dumps(uuid2ans, ensure_ascii=False))
            # 3. Score the saved predictions.
            print_test_score()
    else:
        raise Exception("未知的mode:{}".format(config.mode))
def main(_):
    """Dispatch to the pipeline stage requested via the --mode flag.

    Unknown modes fall through silently, matching the original behavior.
    """
    config = flags.FLAGS
    mode = config.mode

    if mode == "preprocess":
        prepro(config)
    elif mode == "train":
        train(config)
    elif mode == "debug":
        # Shrink every schedule knob so one full train/dev/save cycle
        # finishes almost immediately.
        config.train_steps = 2
        for knob in ("dev_steps", "dev_period", "save_period"):
            setattr(config, knob, 1)
        train(config)
    elif mode == "evaluate":
        pass  # not implemented yet
def Create_Pairs():
    """Build siamese training pairs from the 20HP (source) and 10HP (target)
    bearing datasets and save them under ``./pairs``.

    A pair is positive when both windows carry the same class label;
    negatives are down-sampled to at most 3x the number of positives.
    Also saves the target-domain test split under ``./test``.
    """
    source_path = r'./data/20HP'
    target_path = r'./data/10HP'

    # Source domain: 90% of the windows go to training.
    X_train_source, y_train_source, valid_sourceX, valid_sourceY, test_sourceX, test_sourceY = preprocess.prepro(
        d_path=source_path, length=1024, number=512, normal=False,
        rate=[0.9, 0.05, 0.05], enc=False, enc_step=28)
    X_train_source = X_train_source[:, :, np.newaxis]
    valid_sourceX = valid_sourceX[:, :, np.newaxis]
    test_sourceX = test_sourceX[:, :, np.newaxis]

    # Target domain: only 5% for training, 90% held out for testing.
    X_train_target, y_train_target, valid_targetX, valid_targetY, test_targetX, test_targetY = preprocess.prepro(
        d_path=target_path, length=1024, number=128, normal=False,
        rate=[0.05, 0.05, 0.9], enc=False, enc_step=28)
    X_train_target = X_train_target[:, :, np.newaxis]
    valid_targetX = valid_targetX[:, :, np.newaxis]
    test_targetX = test_targetX[:, :, np.newaxis]

    np.save('./test/test_targetX.npy', test_targetX)
    np.save('./test/test_targetY.npy', test_targetY)

    # Decode the one-hot labels once instead of inside the pair loop.
    src_labels = [np.argmax(row) for row in y_train_source]
    tgt_labels = [np.argmax(row) for row in y_train_target]

    # Every cross-domain index pair, split by label agreement.
    positives = [[s, t]
                 for s in range(len(src_labels))
                 for t in range(len(tgt_labels))
                 if src_labels[s] == tgt_labels[t]]
    negatives = [[s, t]
                 for s in range(len(src_labels))
                 for t in range(len(tgt_labels))
                 if src_labels[s] != tgt_labels[t]]

    random.shuffle(negatives)
    # Keep at most 3 negatives per positive, then mix everything together.
    pairs = positives + negatives[:3 * len(positives)]
    random.shuffle(pairs)

    n_pairs = len(pairs)
    X1 = np.zeros([n_pairs, 1024, 1], dtype='float32')
    X2 = np.zeros([n_pairs, 1024, 1], dtype='float32')
    y1 = np.zeros([n_pairs])
    y2 = np.zeros([n_pairs])
    p = np.zeros([n_pairs])
    for i, (src_idx, tgt_idx) in enumerate(pairs):
        X1[i, :] = X_train_source[src_idx, :]
        X2[i, :] = X_train_target[tgt_idx, :]
        y1[i] = src_labels[src_idx]
        y2[i] = tgt_labels[tgt_idx]
        if src_labels[src_idx] == tgt_labels[tgt_idx]:
            p[i] = 1

    if not os.path.exists('./pairs'):
        # makedirs creates intermediate directories recursively.
        os.makedirs('./pairs')
    np.save('./pairs/X1.npy', X1)  # source-domain windows
    np.save('./pairs/X2.npy', X2)  # target-domain windows
    np.save('./pairs/y1.npy', y1)  # source labels
    np.save('./pairs/y2.npy', y2)  # target labels
    np.save('./pairs/p.npy', p)    # 1 = same class, 0 = different
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D
from keras.models import Sequential
from keras.utils import plot_model
from keras.regularizers import l2
import preprocess
import numpy as np

path = r'../data/0HP'

# Load the train / validation / test splits of the bearing data.
x_train, y_train, x_valid, y_valid, x_test, y_test = preprocess.prepro(
    d_path=path, length=2048, number=1000, normal=True, rate=[0.7, 0.2, 0.1],
    enc=True, enc_step=28)

# Append a trailing channel axis so the Conv1D/LSTM layers accept the data.
x_train, x_valid, x_test = x_train[:, :, np.newaxis], x_valid[:, :, np.newaxis], x_test[:, :, np.newaxis]

# Per-sample input shape — with the settings above this is (2048, 1).
input_shape = x_train.shape[1:]


def lstm_classification():
    model = Sequential()
    # Add a convolution layer before the LSTM to extract features; max
    # pooling is needed to improve accuracy (per the original author's note).
    # NOTE(review): the model.add(...) call is continued beyond this chunk.
    model.add(
# Training hyper-parameters.
batch_size = 128
epochs = 12
num_classes = 10
length = 2048
BatchNorm = True  # whether to apply batch normalization
number = 1000     # samples per class
normal = True     # whether to standardize the signals
rate = [0.7, 0.2, 0.1]  # train / validation / test split ratio
date = time.strftime("%Y%m%d", time.localtime())
mark = time.strftime("%Y%m%d_%H%M", time.localtime())

path = r'data\0HP'
x_train, y_train, x_valid, y_valid, x_test, y_test = preprocess.prepro(
    d_path=path,
    length=length,
    number=number,
    normal=normal,
    rate=rate,
    enc=True,
    enc_step=28)

# Append a channel dimension to each split for the 1-D network input.
x_train, x_valid, x_test = (
    split[:, :, np.newaxis] for split in (x_train, x_valid, x_test))
input_shape = x_train.shape[1:]

print('训练样本维度:', x_train.shape)
print(x_train.shape[0], '训练样本个数')
print('验证样本的维度', x_valid.shape)
print(x_valid.shape[0], '验证样本个数')
print('测试样本的维度', x_test.shape)
print(x_test.shape[0], '测试样本个数')

model_name = "lstm_diagnosis-{}".format(mark)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split
import datetime
import warnings
import time
import make_feature_and_label, preprocess, lgb, DNN
warnings.filterwarnings('ignore')

# NN hyper-parameters (presumably consumed by the DNN module — TODO confirm).
batch_size = 100
nb_classes = 10
nb_epoch = 20

# Read the preprocessed data.
# NOTE(review): 'trian' is a typo for 'train' but is kept as-is because
# later code refers to these module-level names.
print(u'预处理部分')
trian_data, trian, all_customer = preprocess.prepro()
#result['min_time_point'] = result['order_pay_date_1'] + datetime.timedelta(days=180)
#result['max_time_point'] = pd.to_datetime('2014-01-01')+datetime.timedelta(days=180)

# Validation window start: 180 days before the latest order date.
validata_date_begin = trian['order_pay_date'].max() - datetime.timedelta(
    days=180)

# Simple feature-generation section:
# build the training slice and the online/submission slice by date cutoff.
train_history = trian[(trian['order_pay_date'].astype(str) <= '2013-06-30')]
online_history = trian[(trian['order_pay_date'].astype(str) <= '2013-12-31')]
# train_label: the 180 days of data following train_history's cutoff.
train_label = trian[trian['order_pay_date'].astype(str) >= '2013-07-01']

# Feature engineering and train/test-set construction.
print(u'特征工程及训练集/测试集部分')
start = time.perf_counter()
train = make_feature_and_label.make_feature_and_label(train_history,
                                                      train_label, False)
                 activation='relu')(inputs)  # padding='same' would zero-pad when necessary
    # Dropout against overfitting: each unit is disabled with probability 0.3.
    x = Dropout(0.3)(x)  # output is still 20*64 (per original author's note)
    # Bidirectional recurrent layer over the extracted features.
    lstm_out = Bidirectional(LSTM(64, return_sequences=True))(x)
    lstm_out = Dropout(0.3)(lstm_out)
    lstm_mul = Flatten()(lstm_out)
    # Single sigmoid output unit.
    output = Dense(1, activation='sigmoid')(lstm_mul)
    model = Model(inputs=[inputs], outputs=output)
    return model


path = '../pollution.csv'
rate = [0.7, 0.2, 0.1]  # train / validation / test split ratio
train_x, train_y, valid_x, valid_y, test_x, test_y = preprocess.prepro(
    path, rate)

model = baselineModel2()
model.compile(optimizer='adam', loss='mse')
model.fit([train_x], train_y,
          epochs=5,
          batch_size=64,
          validation_data=[valid_x, valid_y])
# model.fit(train_x, train_y, epochs=5, batch_size=64, validation_split=0.1)

predict = model.predict(test_x)
# Accumulate the squared error over the test predictions.
loss = 0
for i in range(len(predict)):
    loss = loss + (predict[i] - test_y[i]) * (predict[i] - test_y[i])
print(loss)
# print(len(test_y), len(predict))
plot_model(model=model, to_file='attentionLSTM.png', show_shapes=True)