def main():
    config = {
        'batch_size': 128,
        'emb_dim': 150,
        'mem_size': 100,
        'test': False,
        'n_epoch': 50,
        'n_hop': 6,
        'n_words': None,
        'lr': 0.001,
        'std_dev': 0.05,
        'cp_dir': 'checkpoints'
    }
    count = list()
    word2idx = dict()

    # read_data fills count and word2idx in place while loading each PTB split.
    train_data = read_data('./data/ptb.train.txt', count, word2idx)
    valid_data = read_data('./data/ptb.valid.txt', count, word2idx)
    test_data = read_data('./data/ptb.test.txt', count, word2idx)

    config['n_words'] = len(word2idx)

    with tf.Session() as sess:
        print("Training...")
        mod = MemN2N(config, sess)
        mod.train(train_data, valid_data)
        mod.test(test_data)
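# --- Hedged sketch (not part of the original file): a plausible shape for the
# read_data helper used above. It appends (word, frequency) pairs to `count`,
# assigns an id to every unseen word in `word2idx`, and returns the file as a
# flat list of word ids. The real implementation may differ (e.g. <unk>/<eos>
# handling), so this is illustrative only.
import collections


def read_data_sketch(fname, count, word2idx):
    with open(fname) as f:
        # Mark line ends so sentence boundaries survive the split into tokens.
        words = f.read().replace('\n', ' <eos> ').split()
    # Accumulate word frequencies for this split.
    count.extend(collections.Counter(words).most_common())
    # Assign a new id to every word not seen in an earlier split.
    for word in words:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    return [word2idx[word] for word in words]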
def run(opt):
    # Read the training set
    train_filename_list, train_label_list = data_preprocess.read_data(
        directory=opt.train_directory, dir2label_dict=opt.dir2label_dict)
    # Define the data augmentation transforms
    augmentation = data_preprocess.data_augmentation(opt.img_resize,
                                                     opt.img_random_crop,
                                                     mode='train')
    train_dataset = MyDataset(filenames=train_filename_list,
                              labels=train_label_list,
                              transform=augmentation)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               pin_memory=True)
    # For multi-scale training, switch to the loader below; the key change is the
    # custom batch_sampler (batch_size/shuffle must not be passed together with it).
    # train_loader = torch.utils.data.DataLoader(train_dataset,
    #                                            pin_memory=True,
    #                                            batch_sampler=BatchSampler(RandomSampler(train_dataset),
    #                                                                       batch_size=64,
    #                                                                       drop_last=True,
    #                                                                       multiscale_step=1,
    #                                                                       img_sizes=list(range(320, 608 + 1, 32))))

    # Read the test set
    test_filename_list, test_label_list = data_preprocess.read_data(
        directory=opt.test_directory, dir2label_dict=opt.dir2label_dict)
    # The training-time augmentation defined above is reused for the test set.
    test_dataset = MyDataset(filenames=test_filename_list,
                             labels=test_label_list,
                             transform=augmentation)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opt.batch_size,
                                              shuffle=True,
                                              pin_memory=True)
    # For multi-scale evaluation, switch to the loader below (same custom batch sampler).
    # test_loader = torch.utils.data.DataLoader(test_dataset,
    #                                           pin_memory=True,
    #                                           batch_sampler=BatchSampler(RandomSampler(test_dataset),
    #                                                                      batch_size=64,
    #                                                                      drop_last=True,
    #                                                                      multiscale_step=1,
    #                                                                      img_sizes=list(range(320, 608 + 1, 32))))

    # val_dataset = MyDataset(filenames=val_filename_list, labels=val_label_list, transform=augmentation)
    # val_loader = torch.utils.data.DataLoader(val_dataset,
    #                                          batch_size=opt.batch_size, shuffle=True,
    #                                          pin_memory=True)

    # Define a network
    net = get_pretrain_model(opt.model_name, opt.num_classes)
    # Train on the training set and evaluate on the test set; no cross-validation,
    # so the split index is fixed to 0.
    train.train(net, 0, train_loader, test_loader, opt)
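# --- Hedged sketch (not part of the original project): one plausible implementation
# of the custom multi-scale BatchSampler referenced in the commented-out loaders
# above. The constructor mirrors the call site (sampler, batch_size, drop_last,
# multiscale_step, img_sizes); everything else is an assumption. It yields batches
# of (index, img_size) pairs, so MyDataset.__getitem__ would have to accept such a
# pair and resize the image accordingly.
import random

from torch.utils.data import Sampler


class BatchSampler(Sampler):
    def __init__(self, sampler, batch_size, drop_last, multiscale_step=1, img_sizes=None):
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.multiscale_step = multiscale_step
        self.img_sizes = img_sizes or [416]

    def __iter__(self):
        num_batches = 0
        size = random.choice(self.img_sizes)
        batch = []
        for idx in self.sampler:
            batch.append((idx, size))
            if len(batch) == self.batch_size:
                yield batch
                num_batches += 1
                batch = []
                # Re-draw the target image size every `multiscale_step` batches.
                if num_batches % self.multiscale_step == 0:
                    size = random.choice(self.img_sizes)
        if batch and not self.drop_last:
            yield batch

    def __len__(self):
        if self.drop_last:
            return len(self.sampler) // self.batch_size
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size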
def main():
    unneeded = ['RowNumber', 'CustomerId', 'Surname']
    encode = ['Geography', 'Gender']
    data_set = preprocess.read_data('data_set.csv', unneeded, encode)
    train_x, train_y, test_x, test_y = preprocess.train_test_split(data_set, 0.8)
    train_x = preprocess.normalize_data(train_x)
    test_x = preprocess.normalize_data(test_x)

    # Train only if no saved model file exists yet.
    if not os.path.isfile('ANN.pt'):
        train(train_x, train_y)

    train_y_pred = test(train_x)
    check_result(train_y_pred, train_y)
    test_y_pred = test(test_x)
    check_result(test_y_pred, test_y)
def run_cv(opt):
    # Read the images. Unlike run(), the training and validation (test) images live
    # in a single directory; the k-fold split below divides them into train/val sets.
    filename_list, label_list = data_preprocess.read_data(
        directory=opt.train_directory, dir2label_dict=opt.dir2label_dict)

    # Stratified sampling; passing random_state=0 would make every run_cv() call
    # produce the same train/validation split.
    skfold = StratifiedKFold(n_splits=opt.cv_num, shuffle=False)

    for split, (train_index_list, val_index_list) in enumerate(
            skfold.split(label_list, label_list)):
        print('**********Split %d**********' % split)
        print('After stratified sampling, training set size: {0}, validation set size: {1}.'.format(
            len(train_index_list), len(val_index_list)))
        train_label_num_dict = tool.count_class_num(label_list, train_index_list)
        val_label_num_dict = tool.count_class_num(label_list, val_index_list)
        train_label_num_dict = sorted(train_label_num_dict.items(), key=lambda x: x[0])
        val_label_num_dict = sorted(val_label_num_dict.items(), key=lambda x: x[0])
        print('Per-class sample counts in the training set: ', train_label_num_dict)
        print('Per-class sample counts in the validation set: ', val_label_num_dict)

        # Define the data augmentation transforms
        augmentation = data_preprocess.data_augmentation(opt.img_resize,
                                                         opt.img_random_crop,
                                                         mode='train')

        # Build the training set from the indices produced by the stratified split
        train_filename_list = tool.get_index_value(value_list=filename_list,
                                                   index_list=train_index_list)
        train_label_list = tool.get_index_value(value_list=label_list,
                                                index_list=train_index_list)
        train_dataset = MyDataset(filenames=train_filename_list,
                                  labels=train_label_list,
                                  transform=augmentation)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=opt.batch_size,
                                                   shuffle=True,
                                                   pin_memory=True)

        # Build the validation set from the indices produced by the stratified split
        val_filename_list = tool.get_index_value(value_list=filename_list,
                                                 index_list=val_index_list)
        val_label_list = tool.get_index_value(value_list=label_list,
                                              index_list=val_index_list)
        val_dataset = MyDataset(filenames=val_filename_list,
                                labels=val_label_list,
                                transform=augmentation)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=opt.batch_size,
                                                 shuffle=True,
                                                 pin_memory=True)

        # Define a network
        net = get_pretrain_model(opt.model_name, opt.num_classes)
        # Train on the training split and evaluate on the validation split
        train.train(net, split, train_loader, val_loader, opt)
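# --- Hedged sketch (not from the original project): plausible behaviour of the two
# helpers from the `tool` module used in run_cv() above; the real implementations
# may differ.
from collections import Counter


def count_class_num(label_list, index_list):
    """Count how many samples of each class fall into the given index subset."""
    return dict(Counter(label_list[i] for i in index_list))


def get_index_value(value_list, index_list):
    """Select the elements of value_list at the positions given by index_list."""
    return [value_list[i] for i in index_list]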
if encoder_bidirection:
    exp_name += "_encoder_bidirect"
else:
    exp_name += "_no_encoder_bidirect"

# Number of GRU layers for the encoder and decoder, each in the range [1-2]
GRU_layers = {"encoder": 1, "decoder": 2}
exp_name += "_GRU_layers_enc" + str(GRU_layers["encoder"]) + "_dec" + str(GRU_layers["decoder"])

#################
### data read ###
print("data preprocessing")

# q_data contains the longer questions, new_q_data the shorter ones
q_data, new_q_data, img_data = data_preprocess.read_data(root, cate)
target_ids, input_ids, img_data, gen_tokenizer = tokenizer.general_preprocess(q_data, new_q_data, img_data)
# print(target_ids.shape)
# print(input_ids.shape)
# print(len(img_data))
# quit()

img_data = data_preprocess.extract_img_feat(root, cate, img_data,
                                            name=img_model_name, fc=fc_top,
                                            extract_feature=extract_feature)
# target_ids, gen_tokenizer = tokenizer.general_preprocess(q_data)
""" Created on 2020/4/15 21:24 @author: phil """ import numpy as np from data_preprocess import read_data from feature_extraction import BagOfWord, NGram from softmax_regerssion import SoftmaxRegression import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split if __name__ == '__main__': debug = 1 # 读入数据 X_data, y_data = read_data() if debug == 1: # index = np.arange(len(X_data)) # np.random.shuffle(index) # X_data = X_data[index[:2000]] # y_data = y_data[index[:2000]] X_data = X_data[:1000] y_data = y_data[:1000] y = np.array(y_data).reshape(len(y_data), 1) # 数据集划分 bag_of_word_model = BagOfWord(do_lower_case=True) ngram_model = NGram(ngram=(1, 2), do_lower_case=True) X_Bow = bag_of_word_model.fit_transform(X_data) X_Gram = ngram_model.fit_transform(X_data)
from flask import *
from data_preprocess import read_data, group_by_team
from data_logic import first_team, sort_first_eleven, first_eleven_stats, find_similar_to_team, \
    filtering_our_constraints, filtering_user_constraints
import os

app = Flask(__name__)
app.config.update(
    DEBUG=True,
    TEMPLATES_AUTO_RELOAD=True
)
app.secret_key = os.urandom(24)

dat = read_data()
team_data = group_by_team(dat)


@app.route('/change_team', methods=['POST'])
def change_team():
    new_team = str(request.json['team'])
    session['team'] = new_team
    return json.dumps({'success': True}), 200, {'ContentType': 'application/json'}


@app.route('/change_formation', methods=['POST'])
def change_formation():
    new_formation = str(request.json['formation'])
    session['formation'] = new_formation
    return json.dumps({'success': True}), 200, {'ContentType': 'application/json'}


@app.route('/constraints', methods=['POST'])