Example #1
def deal_data(conn, addr):
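    # Handle one client connection: receive the credentials, log in to the
    # library site through Data_process, and stream the query results back.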
    while True:
        # Receive up to 1024 bytes and decode (bytes -> str)
        data = conn.recv(1024).decode()
        if not data:
            break

        print('Student ID:', data[0:10])
        print('Password:', data[10:])

        # Send the student ID, password and captcha to the library website and fetch the data
        dataPro = Data_process(data)
        dataPro.login(dataPro)
        print("Login successful! Fetching data...")
        list1 = dataPro.lscx(dataPro)

        #time.sleep(100)

        # Send the data back to the client
        for i in range(len(list1)):
            for j in range(len(list1[i])):
                conn.send(list1[i][j].encode())

        # Terminator: the client closes its connection once it receives this value
        end = "-1"
        conn.sendall(end.encode())
        print("数据已发送")
        break

    conn.close()
    print("Disconnected client:", addr)
    print("\n")
Example #2
def train():
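    # Build the BiLSTM graph, train it with Adam, evaluate on the validation
    # split after every epoch, and checkpoint the model whenever recall improves.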
    dataprocess = Data_process(FLAGS.path_input,FLAGS.path_stopword,FLAGS.path_word2vec_model,
                               FLAGS.word2vev_size,FLAGS.max_length,FLAGS.min_counter,FLAGS.rate)
    trainReviews, trainLabels, evalReviews, evalLabels, wordembedding = dataprocess.dataGen()
    with tf.Graph().as_default():
        cnn = Bilstm(vocab_size=len(wordembedding),hidden_dims=[256,256],embedding=wordembedding,
                     hidden_size=FLAGS.hidden_size,max_length=FLAGS.max_length,dropoutKeepProb=FLAGS.dropoutrate,
                     numClass=FLAGS.numclass)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learnrate)
        # Compute the gradients, returning (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(cnn.loss)
        # Apply the gradients to the variables, producing the training op that updates the parameters
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)
        # Create the Saver after all variables (including the optimizer's slot variables) exist
        saver = tf.train.Saver()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            recall_max = 0
            for i in range(FLAGS.epoch):
                for batch in dataprocess.nextBatch(trainReviews,trainLabels,FLAGS.batch_size):

                    feed_dict = {
                        cnn.input_X: batch[0],
                        cnn.input_Y: batch[1]
                    }
                    predictions, loss, _, output, step = sess.run([cnn.predictions, cnn.loss, trainOp, cnn.output, globalStep],
                                                                  feed_dict)
                    acc = accuracy_score(batch[1], output)
                    precision = precision_score(batch[1], output, average='weighted')
                    recall = recall_score(batch[1], output, average='micro')
                    timeStr = datetime.datetime.now().isoformat()
                    print("{}, iter: {}, step: {}, loss: {},acc: {}, precision: {}, recall: {}"
                          .format(timeStr, i, step, loss, acc, precision, recall))
                acces = []
                precisiones = []
                recalles = []
                for batch_eva in dataprocess.nextBatch(evalReviews, evalLabels, FLAGS.batch_size):

                    loss, output = sess.run([cnn.loss, cnn.output], feed_dict={
                        cnn.input_X: batch_eva[0],
                        cnn.input_Y: batch_eva[1]
                    })
                    acc = accuracy_score(batch_eva[1], output)
                    precision = precision_score(batch_eva[1], output, average='weighted')
                    recall = recall_score(batch_eva[1], output, average='micro')
                    acces.append(acc)
                    precisiones.append(precision)
                    recalles.append(recall)
                acc = sum(acces)/len(acces)
                precision = sum(precisiones)/len(precisiones)
                recall = sum(recalles)/len(recalles)
                print("验证集结果:")
                print("{}, iter: {}, loss: {},acc: {}, precision: {}, recall: {}"
                      .format(timeStr, i, loss, acc, precision, recall))
                if recall > recall_max:
                    recall_max = recall
                    print("正在保存模型")
                    saver.save(sess, FLAGS.path_model, global_step=step)
Example #3
    def keyword_matching(self, query):  # Return each query and its keyword-matching score
        words = cut(query, use_stopwords=True)
        # print(words)

        data_process = Data_process(self.id)
        self.datas = data_process.read_all_data_state_1()

        self.answer_score = {}
        for idx, data in enumerate(self.datas):
            self.answer_score[data["query"]] = [idx, 0]

        for word in words:
            for data in self.datas:
                if word in data["keywords"]:
                    self.answer_score[data["query"]][1] += 1

        return self.answer_score
Example #4
def get_data():
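    # Flask view: look up all state-1 records for the requested id and return them as JSON.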
    id = request.args.get('id')

    data_process = Data_process(id)
    datas = data_process.read_all_data_state_1()

    datas_dict = {}
    for idx,data in enumerate(datas):
        _data = {}
        _data["query"] = data["query"]
        _data["keywords"] = data["keywords"]
        _data["answer"] = data["answer"]

        datas_dict[idx] = _data

    json_dict = {
        "id": id,
        "datas": datas_dict
    }
    return jsonify(json_dict)
Example #5
def model_predict(input_feature, seq_len_list):  # signature reconstructed from the call in __main__ below
    with tf.Session() as sess:
        sess.run(init)
        checkpoint = tf.train.latest_checkpoint("model/lstm_model")
        saver.restore(sess,checkpoint)
        result = sess.run(prediction,feed_dict={Xs:input_feature,seq_tf_len:seq_len_list})
        print(result)
        return result





if __name__ == '__main__':
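    # Load the unlabeled test sample, preprocess it, build features, and score
    # each record with the restored LSTM model.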
    input_names = ["index", "move_data", "target"]
    input_path = "data/dsjtzs_txfz_test_sample.txt"
    original_no_label = Data_process(input_path,input_names,has_label=False)
    input_data,input_seq_lenth = original_no_label.preprocessing_data()
    input_feature = feature_engineering(original_no_label,input_data,input_seq_lenth)
    for i in range(len(input_feature)):
        result = model_predict(input_feature[i],input_seq_lenth[i])
        print("第{}条记录有{:.2%}概率是异常记录!".format(i + 1, result[0][1]))

    # tf.reset_default_graph()
    # lstm_model = LSTM_model()
    # # Define placeholders
    # Xs = lstm_model.Xs
    # ys = lstm_model.ys
    # seq_len = lstm_model.seq_len
    # prediction = lstm_model.model_building(Xs=Xs,seq_len=seq_len,bidirection_lstm_layer=True)
    # # Data after feature engineering
    # input_feature = feature_engineering(original_no_label,input_data, input_seq_lenth)
Example #6
from data_process import Data_process, Vocab

data_process_ins = Data_process('src/class-0.txt', 'src/class-1.txt')
vocab = Vocab(data_process_ins.counter.vocabulary_, 10)
pass
Example #7
    # Standardize new_input_data
    scale_input_data = []
    for i in range(len(new_input_data)):
        seq_lenth_single = sequence_lenth[i]
        array_single = new_input_data[i]
        new_arr = original_data_process.scale_exept_0(array_single,
                                                      seq_lenth_single)
        scale_input_data.append(new_arr)
    return scale_input_data


if __name__ == '__main__':
    input_path = "data/dsjtzs_txfz_training.txt"
    input_names = ["index", "move_data", "target", "label"]
    original_data_process = Data_process(input_path, input_names)
    input_data, label, sequence_lenth = original_data_process.preprocessing_data()
    """划分训练集和测试集,由于之前已经shuffle过了,直接取前2500条数据作为训练集,500条作为测试集"""
    # Data after feature engineering
    input_feature = feature_engineering(original_data_process, input_data,
                                        sequence_lenth)
    trainX = input_feature[:2500]
    testX = input_feature[2500:]
    trainY = label[:2500]
    testY = label[2500:]
    sequence_lenth_train = sequence_lenth[:2500]
    sequence_lenth_test = sequence_lenth[2500:]

    # Check whether input_feature contains any NaN values
    list_nan = list(map(tuple, np.argwhere(np.isnan(np.array(input_feature)))))
Example #8
def main(args):
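    # End-to-end training entry point: process the SQuAD data if needed, build the
    # data loaders and the QANet_Unanswerable model, then hand everything to Model_Trainer.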
    #show all arguments and configuration
    print(args)
    seed = None
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    #set log file
    log = sys.stdout
    if args.log_file is not None:
        log = open(args.log_file,'a')

    #set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    number_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        print('device is cuda and cuda number is ', number_gpu)
    else:
        print('device is cpu')

    #process data
    if not args.data_processed:
        process_data = Data_process(args)
        process_data.process()

    #load word, char embedding and word dictionary
    word_emb_tensor = torch.FloatTensor(np.array(pickle_load_large_file(args.processed_word_embedding),
                                                 dtype=np.float32))
    char_emb_tensor = torch.FloatTensor(np.array(pickle_load_large_file(args.processed_char_embedding),
                                                 dtype=np.float32))
    word2idx_dict = pickle_load_large_file(args.word_dictionary)

    SQuAD_train_dataset = SQuADDataset(args.train_processed_data)
    train_data_loader = DataLoader(dataset=SQuAD_train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0,
                                   collate_fn=collate)
    SQuAD_dev_dataset = SQuADDataset(args.dev_processed_data)
    dev_data_loader = DataLoader(dataset=SQuAD_dev_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0,
                                 collate_fn=collate)

    #initialize model
    model = QANet_Unanswerable(word_emb_tensor, char_emb_tensor, args.model_dim,
                        num_heads = args.num_heads, train_char_emb = args.char_emb_pretrained,
                        pad=word2idx_dict['<PAD>'])
    model.summary()
    if torch.cuda.device_count() > 1 and args.multi_gpu:
        model = nn.DataParallel(model)
    model.to(device)

    # exponential moving average
    ema = EMA(args.decay)
    if args.use_ema:
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    # set optimizer and scheduler
    #lr = args.lr
    #base_lr = 1
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(params = parameters, lr = args.lr, betas = (args.beta1, args.beta2),eps = 1e-8, weight_decay = 3e-7)
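    # Warm the learning rate up logarithmically over the first lr_warm_up_num steps,
    # then keep the LambdaLR multiplier fixed at 1.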
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else 1)
    #optimizer = optim.Adam(params = parameters, lr = base_lr, betas = (args.beta1, args.beta2),eps = 1e-8, weight_decay = 3e-7)
    #cr = lr / math.log(args.lr_warm_up_num)
    #scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else lr)

    # set loss, metrics
    loss = torch.nn.CrossEntropyLoss()

    # set visdom visualizer to store training process information
    # see the training process on http://localhost:8097/
    vis = None
    if args.visualizer:
        os.system("python -m visdom.server")
        vis = Visualizer("main")

    # construct trainer
    # an identifier (prefix) for saved model
    identifier = type(model).__name__ + '_'
    trainer = Model_Trainer(
        args, model, loss,
        train_data_loader=train_data_loader,
        dev_data_loader=dev_data_loader,
        dev_eval_file=args.dev_eval_data,
        optimizer=optimizer,
        scheduler=scheduler,
        epochs=args.epochs,
        with_cuda=args.with_cuda,
        save_dir=args.save_dir,
        verbosity=args.verbosity,
        save_freq=args.save_freq,
        print_freq=args.print_freq,
        resume=args.resume,
        identifier=identifier,
        debug=args.debug,
        debug_batchnum=args.debug_batchnum,
        lr=args.lr,
        lr_warm_up_num=args.lr_warm_up_num,
        grad_clip=args.grad_clip,
        decay=args.decay,
        visualizer=vis,
        logger=log,
        use_scheduler=args.use_scheduler,
        use_grad_clip=args.use_grad_clip,
        use_ema=args.use_ema,
        ema=ema,
        use_early_stop=args.use_early_stop,
        early_stop=args.early_stop)

    # start training!
    start = datetime.now()
    trainer.train()
    print("Time of training model ", datetime.now() - start)