def deal_data(conn, addr):
    while True:
        # Receive up to 1024 bytes and decode (bytes -> str)
        data = conn.recv(1024).decode()
        if not data:
            break
        print('Student ID:', data[0:10])
        print('Password:', data[10:])
        # Submit the student ID, password and captcha to the library website and fetch the data
        dataPro = Data_process(data)
        dataPro.login(dataPro)
        print("Login succeeded! Fetching data...")
        list1 = dataPro.lscx(dataPro)
        # time.sleep(100)
        # Send the data back to the client
        for i in range(len(list1)):
            for j in range(len(list1[i])):
                conn.send(list1[i][j].encode())
        # End marker; the client closes the connection when it receives this value
        end = "-1"
        conn.sendall(end.encode())
        print("Data sent")
        break
    conn.close()
    print("Disconnected client:", addr)
    print("\n")
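# Minimal client sketch for the deal_data() server above. HOST/PORT are
# placeholders (the listening address is not shown in the snippet); the wire
# format follows the server: the 10-character student ID concatenated with the
# password is sent, then rows are read until the "-1" end marker arrives.
import socket

HOST, PORT = "127.0.0.1", 8888  # hypothetical address

def query_library(student_id, password):
    with socket.create_connection((HOST, PORT)) as sock:
        sock.sendall((student_id + password).encode())
        rows = []
        while True:
            chunk = sock.recv(1024).decode()
            # The server sends fields one by one and "-1" last; TCP does not
            # guarantee this framing, so a robust client would add delimiters.
            if not chunk or chunk == "-1":
                break
            rows.append(chunk)
        return rows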
def train():
    dataprocess = Data_process(FLAGS.path_input, FLAGS.path_stopword, FLAGS.path_word2vec_model,
                               FLAGS.word2vev_size, FLAGS.max_length, FLAGS.min_counter, FLAGS.rate)
    trainReviews, trainLabels, evalReviews, evalLabels, wordembedding = dataprocess.dataGen()

    with tf.Graph().as_default():
        cnn = Bilstm(vocab_size=len(wordembedding), hidden_dims=[256, 256], embedding=wordembedding,
                     hidden_size=FLAGS.hidden_size, max_length=FLAGS.max_length,
                     dropoutKeepProb=FLAGS.dropoutrate, numClass=FLAGS.numclass)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learnrate)
        # Compute gradients, returning (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(cnn.loss)
        # Apply the gradients to the variables to build the training op that updates the parameters
        saver = tf.train.Saver()
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            recall_max = 0
            for i in range(FLAGS.epoch):
                for batch in dataprocess.nextBatch(trainReviews, trainLabels, FLAGS.batch_size):
                    feed_dict = {
                        cnn.input_X: batch[0],
                        cnn.input_Y: batch[1]
                    }
                    predictions, loss, _, output, step = sess.run(
                        [cnn.predictions, cnn.loss, trainOp, cnn.output, globalStep], feed_dict)
                    acc = accuracy_score(batch[1], output)
                    precision = precision_score(batch[1], output, average='weighted')
                    recall = recall_score(batch[1], output, average='micro')
                    timeStr = datetime.datetime.now().isoformat()
                    print("{}, iter: {}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}"
                          .format(timeStr, i, step, loss, acc, precision, recall))

                acces = []
                precisiones = []
                recalles = []
                for batch_eva in dataprocess.nextBatch(evalReviews, evalLabels, FLAGS.batch_size):
                    loss, output = sess.run([cnn.loss, cnn.output], feed_dict={
                        cnn.input_X: batch_eva[0],
                        cnn.input_Y: batch_eva[1]
                    })
                    acc = accuracy_score(batch_eva[1], output)
                    precision = precision_score(batch_eva[1], output, average='weighted')
                    recall = recall_score(batch_eva[1], output, average='micro')
                    acces.append(acc)
                    precisiones.append(precision)
                    recalles.append(recall)
                acc = sum(acces) / len(acces)
                precision = sum(precisiones) / len(precisiones)
                recall = sum(recalles) / len(recalles)
                print("Validation results:")
                print("{}, iter: {}, loss: {}, acc: {}, precision: {}, recall: {}"
                      .format(timeStr, i, loss, acc, precision, recall))
                if recall > recall_max:
                    recall_max = recall
                    print("Saving model")
                    saver.save(sess, FLAGS.path_model, global_step=step)
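# Sketch of a mini-batch generator in the shape of the dataprocess.nextBatch()
# call used in train(). The real Data_process implementation is not shown, so
# this is only an assumed, typical helper: yield (x, y) slices of batch_size.
import numpy as np

def next_batch(x, y, batch_size, shuffle=True):
    x, y = np.asarray(x), np.asarray(y)
    idx = np.arange(len(x))
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, len(x), batch_size):
        batch_idx = idx[start:start + batch_size]
        yield x[batch_idx], y[batch_idx]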
def keyword_matching(self, query):
    # Score every stored query by how many of the input query's keywords it matches
    words = cut(query, use_stopwords=True)
    # print(words)
    data_process = Data_process(self.id)
    self.datas = data_process.read_all_data_state_1()
    self.answer_score = {}
    for idx, data in enumerate(self.datas):
        self.answer_score[data["query"]] = [idx, 0]
    for word in words:
        for data in self.datas:
            if word in data["keywords"]:
                self.answer_score[data["query"]][1] += 1
    return self.answer_score
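# Usage sketch (hypothetical caller): keyword_matching() returns a dict of
# {stored_query: [index, score]}, so the best match is the entry with the
# highest score.
scores = matcher.keyword_matching("how do I renew a borrowed book")  # matcher is a hypothetical instance
best_query, (best_idx, best_score) = max(scores.items(), key=lambda kv: kv[1][1])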
def get_data():
    id = request.args.get('id')
    data_process = Data_process(id)
    datas = data_process.read_all_data_state_1()
    datas_dict = {}
    for idx, data in enumerate(datas):
        _data = {}
        _data["query"] = data["query"]
        _data["keywords"] = data["keywords"]
        _data["answer"] = data["answer"]
        datas_dict[idx] = _data
    json_dict = {
        "id": id,
        "datas": datas_dict
    }
    return jsonify(json_dict)
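# Client sketch for the get_data() view above. The route path and port are
# hypothetical (the @app.route decorator is not shown); the view only needs an
# "id" query parameter and returns the JSON payload built above.
import requests

resp = requests.get("http://127.0.0.1:5000/get_data", params={"id": "1"})
payload = resp.json()  # {"id": "1", "datas": {"0": {"query": ..., "keywords": ..., "answer": ...}, ...}}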
def model_predict(input_feature, seq_len_list):
    # Signature reconstructed from the call in __main__ below; the graph-building
    # code (Xs, seq_tf_len, prediction, init, saver) lives at module level and is
    # not part of this snippet.
    with tf.Session() as sess:
        sess.run(init)
        checkpoint = tf.train.latest_checkpoint("model/lstm_model")
        saver.restore(sess, checkpoint)
        result = sess.run(prediction, feed_dict={Xs: input_feature, seq_tf_len: seq_len_list})
        print(result)
        return result


if __name__ == '__main__':
    input_names = ["index", "move_data", "target"]
    input_path = "data/dsjtzs_txfz_test_sample.txt"
    original_no_label = Data_process(input_path, input_names, has_label=False)
    input_data, input_seq_lenth = original_no_label.preprocessing_data()
    input_feature = feature_engineering(original_no_label, input_data, input_seq_lenth)
    for i in range(len(input_feature)):
        result = model_predict(input_feature[i], input_seq_lenth[i])
        print("Record {} is anomalous with probability {:.2%}!".format(i + 1, result[0][1]))

    # tf.reset_default_graph()
    # lstm_model = LSTM_model()
    # # Define placeholders
    # Xs = lstm_model.Xs
    # ys = lstm_model.ys
    # seq_len = lstm_model.seq_len
    # prediction = lstm_model.model_building(Xs=Xs, seq_len=seq_len, bidirection_lstm_layer=True)
    # # Data after feature engineering
    # input_feature = feature_engineering(original_no_label, input_data, input_seq_lenth)
from data_process import Data_process, Vocab

data_process_ins = Data_process('src/class-0.txt', 'src/class-1.txt')
vocab = Vocab(data_process_ins.counter.vocabulary_, 10)
    # Standardize new_input_data
    scale_input_data = []
    for i in range(len(new_input_data)):
        seq_lenth_single = sequence_lenth[i]
        array_single = new_input_data[i]
        new_arr = original_data_process.scale_exept_0(array_single, seq_lenth_single)
        scale_input_data.append(new_arr)
    return scale_input_data


if __name__ == '__main__':
    input_path = "data/dsjtzs_txfz_training.txt"
    input_names = ["index", "move_data", "target", "label"]
    original_data_process = Data_process(input_path, input_names)
    input_data, label, sequence_lenth = original_data_process.preprocessing_data()
    """Split into training and test sets; the data was already shuffled, so take the
    first 2500 samples for training and the remaining 500 for testing."""
    # Data after feature engineering
    input_feature = feature_engineering(original_data_process, input_data, sequence_lenth)
    trainX = input_feature[:2500]
    testX = input_feature[2500:]
    trainY = label[:2500]
    testY = label[2500:]
    sequence_lenth_train = sequence_lenth[:2500]
    sequence_lenth_test = sequence_lenth[2500:]
    # Check whether the engineered features (scale_input_data) contain NaN values
    list_nan = list(map(tuple, np.argwhere(np.isnan(np.array(input_feature)))))
def main(args):
    # Show all arguments and configuration
    print(args)

    seed = None
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Set log file
    log = sys.stdout
    if args.log_file is not None:
        log = open(args.log_file, 'a')

    # Set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    number_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        print('device is cuda and cuda number is', number_gpu)
    else:
        print('device is cpu')

    # Process data
    if not args.data_processed:
        process_data = Data_process(args)
        process_data.process()

    # Load word and char embeddings and the word dictionary
    word_emb_tensor = torch.FloatTensor(
        np.array(pickle_load_large_file(args.processed_word_embedding), dtype=np.float32))
    char_emb_tensor = torch.FloatTensor(
        np.array(pickle_load_large_file(args.processed_char_embedding), dtype=np.float32))
    word2idx_dict = pickle_load_large_file(args.word_dictionary)

    SQuAD_train_dataset = SQuADDataset(args.train_processed_data)
    train_data_loader = DataLoader(dataset=SQuAD_train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=0, collate_fn=collate)
    SQuAD_dev_dataset = SQuADDataset(args.dev_processed_data)
    dev_data_loader = DataLoader(dataset=SQuAD_dev_dataset, batch_size=args.batch_size,
                                 shuffle=True, num_workers=0, collate_fn=collate)

    # Initialize model
    model = QANet_Unanswerable(word_emb_tensor, char_emb_tensor, args.model_dim,
                               num_heads=args.num_heads, train_char_emb=args.char_emb_pretrained,
                               pad=word2idx_dict['<PAD>'])
    model.summary()
    if torch.cuda.device_count() > 1 and args.multi_gpu:
        model = nn.DataParallel(model)
    model.to(device)

    # Exponential moving average of the model parameters
    ema = EMA(args.decay)
    if args.use_ema:
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    # Set optimizer and scheduler
    # lr = args.lr
    # base_lr = 1
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(params=parameters, lr=args.lr, betas=(args.beta1, args.beta2),
                           eps=1e-8, weight_decay=3e-7)
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else 1)
    # optimizer = optim.Adam(params=parameters, lr=base_lr, betas=(args.beta1, args.beta2), eps=1e-8, weight_decay=3e-7)
    # cr = lr / math.log(args.lr_warm_up_num)
    # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else lr)

    # Set loss and metrics
    loss = torch.nn.CrossEntropyLoss()

    # Set up a visdom visualizer to record training-process information;
    # see the training progress at http://localhost:8097/
    vis = None
    if args.visualizer:
        os.system("python -m visdom.server")
        vis = Visualizer("main")

    # Construct the trainer; identifier is a prefix for saved model files
    identifier = type(model).__name__ + '_'
    trainer = Model_Trainer(
        args, model, loss,
        train_data_loader=train_data_loader,
        dev_data_loader=dev_data_loader,
        dev_eval_file=args.dev_eval_data,
        optimizer=optimizer,
        scheduler=scheduler,
        epochs=args.epochs,
        with_cuda=args.with_cuda,
        save_dir=args.save_dir,
        verbosity=args.verbosity,
        save_freq=args.save_freq,
        print_freq=args.print_freq,
        resume=args.resume,
        identifier=identifier,
        debug=args.debug,
        debug_batchnum=args.debug_batchnum,
        lr=args.lr,
        lr_warm_up_num=args.lr_warm_up_num,
        grad_clip=args.grad_clip,
        decay=args.decay,
        visualizer=vis,
        logger=log,
        use_scheduler=args.use_scheduler,
        use_grad_clip=args.use_grad_clip,
        use_ema=args.use_ema,
        ema=ema,
        use_early_stop=args.use_early_stop,
        early_stop=args.early_stop)

    # Start training!
    start = datetime.now()
    trainer.train()
    print("Time of training model", datetime.now() - start)
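# Sketch of the EMA helper assumed by main() above; the real EMA class is not
# shown in this snippet. register() keeps a shadow copy of each parameter and
# update() blends new values in as shadow = decay * shadow + (1 - decay) * param,
# which is the usual exponential-moving-average scheme for model weights.
import torch

class EMA:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, val):
        self.shadow[name] = val.clone()

    def update(self, name, val):
        new_avg = self.decay * self.shadow[name] + (1.0 - self.decay) * val
        self.shadow[name] = new_avg.clone()
        return new_avg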