def main():
    parser = argparse.ArgumentParser()
    # Model parameters
    parser.add_argument("--max_sequence_length", default=140, help="Bert input max sequence length", type=int)
    # Path parameters
    parser.add_argument("--train_dataset_path", default='{}/dataset/src_data/train_dataset/nCoV_100k_train.labled.csv'.format(BASE_DIR), help="Train folder")
    parser.add_argument("--test_dataset_path", default='{}/dataset/src_data/test_dataset/nCov_10k_test.csv'.format(BASE_DIR), help="Test folder")
    parser.add_argument("--test_submit_example_path", default='{}/data/test_dataset/submit_example.csv'.format(BASE_DIR), help="submit_example folder")
    parser.add_argument("--bert_pretrain_path", default='{}/dataset/bert_base_chinese/'.format(BASE_DIR), help="Bert Pretrain folder")
    # Others
    parser.add_argument("--input_categories", default="微博中文内容", help="column containing the input text")
    parser.add_argument("--output_categories", default="情感倾向", help="label column")
    parser.add_argument("--epochs", default=2, help="train epochs", type=int)
    parser.add_argument("--batch_size", default=8, help="train batch_size", type=int)
    # Cross-validation parameters
    parser.add_argument("--n_splits", default=5, help="train n_splits", type=int)
    parser.add_argument("--use_cross_valid", default=True, help="whether to use cross-validation")
    parser.add_argument("--cross_dataset_path", default='{}/dataset/cross_data/'.format(BASE_DIR), help="Cross valid folder")
    # Dataset split path
    parser.add_argument("--split_dataset_path", default='{}/dataset/split_data/'.format(BASE_DIR), help="Split dataset folder")
    # Mode
    parser.add_argument("--mode", default='test', help="training or test options")
    parser.add_argument("--loss_type", default="focal_loss", help="loss type is focal_loss or cross_entropy")
    parser.add_argument("--learning_rate_1", default=1e-5, help="learning_rate_1")
    parser.add_argument("--learning_rate_2", default=1e-4, help="learning_rate_2 is None or 1e-4...")
    parser.add_argument("--use_different_learning_rate", default=True, help="whether to use different learning rates")
    # Checkpoint
    parser.add_argument("--model_checkpoint_dir", default='{}/ckpt'.format(BASE_DIR), help="Model folder")

    args = parser.parse_args()
    params = vars(args)

    # Restrict TensorFlow to the first GPU, if one is available
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    if gpus:
        tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')

    if params["mode"] == "train":
        train(params)
    elif params["mode"] == "test":
        test(params)
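# A minimal entry-point guard (assumed; the original excerpt ends at main()):
if __name__ == '__main__':
    main()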
def main_train(data_path):
    data_text, data_tag = load_tag_file_raw_data(data_path)
    data_set, word_vocab = get_tag_file_dataset(data_text, data_tag)
    with open(VOCAB_PATH, 'wb') as fw:
        pickle.dump(word_vocab, fw)
    NWORDS = len(word_vocab)
    encoder = Encoder(NWORDS, EMBED_SIZE, WORD_HIDDEN_SIZE, WORD_NLAYERS,
                      SENTENCE_HIDDEN_SIZE, SENTENCE_NLAYERS, NDOC_DIMS)
    sent_rnn = SentenceRecurrent(SENTENCE_HIDDEN_SIZE)
    train(encoder, sent_rnn, data_set, LR, BATCH_SIZE, N_EPOCHS, word_vocab, CTX)
# Set the mxnet random seed
mx.random.seed(args.seed)
# Pick a GPU context if available, otherwise fall back to CPU
ctx = try_gpu()
# Get train and valid dataloaders
train_dataloader, valid_dataloader, vocab = build_dataloader(args)
# Build model
model = build_model(vocab, args)
# Build loss, trainer and class_weight
loss, trainer, class_weight = build_loss_optimizer(model, args, ctx)
# Train
nepochs = args.nepochs
penalization_coeff = args.penalization_coeff
clip = args.clip
loss_name = args.loss_name
model_root = args.model_root
model_name = args.model_name
log_interval = args.log_interval
lr_decay_step = args.lr_decay_step
lr_decay_rate = args.lr_decay_rate
th.train(train_dataloader, valid_dataloader, model, loss, trainer, ctx,
         nepochs, penalization_coeff, clip, class_weight, loss_name,
         model_name, model_root, log_interval, lr_decay_step, lr_decay_rate)
    dictionary = Dictionary()
    dictionary.build_dictionary(data)
    del data
    joblib.dump(dictionary, config.root_path + '/model/vocab.bin')
else:
    dictionary = joblib.load(args.dictionary)

if not args.model.isupper():
    tokenizer = config.tokenizer
else:
    tokenizer = None

logger.info('Making dataset & dataloader...')
### TODO
# 1. Build the DataLoaders with the custom MyDataset. Completed below
# following the dev/test loader pattern used elsewhere in this repo;
# train_file/dev_file/test_file and batch_size are assumed to be defined
# above this excerpt.
train_dataset = MyDataset(train_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
dev_dataset = MyDataset(dev_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
test_dataset = MyDataset(test_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)

# train
# conf.n_vocab = dictionary.max_vocab_size
model = x.Model(config).to(config.device)
if model_name != 'Transformer':
    init_network(model)
print(model.parameters)
train(config, model, train_dataloader, dev_dataloader, test_dataloader)
start = time.time()
plot_losses = []
print_loss_total = 0
plot_loss_total = 0
ecs = []
dcs = []
eca = 0
dca = 0
while epoch < n_epochs:
    epoch += 1
    input_batches, input_lengths, target_batches, target_lengths = dh.random_batch(
        batch_size, pairs, input_lang, target_lang)
    loss, ec, dc = th.train(input_batches, input_lengths, target_batches,
                            target_lengths, encoder, decoder,
                            encoder_optimizer, decoder_optimizer, train_conf)
    print_loss_total += loss
    plot_loss_total += loss
    eca += ec
    dca += dc
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (
            sh.time_since(start, float(epoch) / n_epochs),
            epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)
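    # (Assumed addition, not in the original excerpt.) The plot_losses/ecs/dcs
    # accumulators above suggest a periodic plotting step; a minimal sketch
    # that would sit inside the loop next to the print_every block, assuming
    # a plot_every interval is defined:
    if epoch % plot_every == 0:
        plot_losses.append(plot_loss_total / plot_every)
        ecs.append(eca / plot_every)
        dcs.append(dca / plot_every)
        plot_loss_total, eca, dca = 0, 0, 0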
def main(model_saved_path, model_name):
    ne_cate_dic = Configuer.ne_cate_dic
    word_path = Configuer.word_path
    label_path = Configuer.label_path
    nature_path = Configuer.nature_path
    X_path = Configuer.X_path
    y_path = Configuer.y_path
    nature_py_path = Configuer.nature_py_path
    word_vocab_path = Configuer.word_vocab_path
    label_vocab_path = Configuer.label_vocab_path
    nature_vocab_path = Configuer.nature_vocab_path
    max_seq_len = Configuer.MAX_SEQ_LEN
    pad = Configuer.PAD
    pad_nature = Configuer.PAD_NATURE
    unk = Configuer.UNK
    not_ne = Configuer.NOT

    # Load previously converted data from disk if it exists
    if os.path.exists(word_vocab_path) and os.path.exists(label_vocab_path)\
            and os.path.exists(nature_vocab_path) and os.path.exists(X_path)\
            and os.path.exists(y_path) and os.path.exists(nature_py_path):
        print('Loading existed data...')
        with open(word_vocab_path, 'rb') as f1, open(label_vocab_path, 'rb') as f2, open(nature_vocab_path, 'rb') as f3:
            word_vocab = pickle.load(f1)
            label_vocab = pickle.load(f2)
            nature_vocab = pickle.load(f3)
        data_x, data_y, data_nature = np.load(X_path), np.load(y_path), np.load(nature_py_path)
        print('Loading end!')
    else:
        # Convert the raw text data into numpy arrays and pickled vocabs
        print('Converting data from scratch...')
        word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
            word_path, label_path, nature_path, max_seq_len, pad, not_ne, pad_nature, unk)
        data_x, data_y, data_nature = convert_txt_data(
            X_path, y_path, nature_py_path, input_seqs, output_seqs, nature_seqs,
            word_vocab, label_vocab, nature_vocab, max_seq_len, unk)
        with open(word_vocab_path, 'wb') as fw1, open(label_vocab_path, 'wb') as fw2, open(nature_vocab_path, 'wb') as fw3:
            pickle.dump(word_vocab, fw1)
            pickle.dump(label_vocab, fw2)
            pickle.dump(nature_vocab, fw3)
        np.save(X_path, data_x)
        np.save(y_path, data_y)
        np.save(nature_py_path, data_nature)
        print('Converting end!')

    # Split into training and validation sets
    X_train, X_valid, Y_train, Y_valid, nature_train, nature_valid = train_test_split(
        data_x, data_y, data_nature, test_size=0.1, random_state=33)
    print(X_train.shape, X_valid.shape)
    # (debug) optionally truncate to 512 samples:
    # X_train = X_train[0:512]; nature_train = nature_train[0:512]; Y_train = Y_train[0:512]
    # X_valid = X_valid[0:512]; nature_valid = nature_valid[0:512]; Y_valid = Y_valid[0:512]

    dataset_train = ArrayDataset(nd.array(X_train, ctx=CTX),
                                 nd.array(nature_train, ctx=CTX),
                                 nd.array(Y_train, ctx=CTX))
    data_iter_train = DataLoader(dataset_train, batch_size=256, shuffle=True, last_batch='rollover')
    dataset_valid = ArrayDataset(nd.array(X_valid, ctx=CTX),
                                 nd.array(nature_valid, ctx=CTX),
                                 nd.array(Y_valid, ctx=CTX))
    data_iter_valid = DataLoader(dataset_valid, batch_size=256, shuffle=False)

    # Configure the model according to model_name
    model, loss = None, None
    word_vocab_size, word_vec_size = len(word_vocab), 300
    nature_vocab_size, nature_vec_size = len(nature_vocab), 50
    drop_prob = 0.3
    num_epochs = 20
    lr = 0.0001
    if model_name == 'lstm_crf':
        print('train lstm_crf model')
        hidden_dim = 128
        num_layers = 2
        tag2idx = label_vocab.token_to_idx
        model = LSTM_CRF(word_vocab_size, word_vec_size, nature_vocab_size,
                         nature_vec_size, hidden_dim, num_layers, tag2idx, drop_prob)
        model.initialize(init=init.Xavier(), ctx=CTX)
        loss = model.crf.neg_log_likelihood
    elif model_name == 'cnn_crf':
        pass
    elif model_name == 'cnn':
        pass
    trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})

    # Start training
    print('waiting...')
    print(model)
    th.train(data_iter_train, data_iter_valid, model, loss, trainer, CTX,
             num_epochs, word_vocab, label_vocab, max_seq_len, ne_cate_dic)

    # Save model parameters
    model.save_parameters(model_saved_path)
    print(model_name + ' model params saved in:', os.path.abspath(model_saved_path))
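# A minimal reload sketch (not in the original script): gluon's
# load_parameters restores weights written by save_parameters, assuming the
# model is rebuilt with the same constructor arguments as above.
model = LSTM_CRF(word_vocab_size, word_vec_size, nature_vocab_size,
                 nature_vec_size, hidden_dim, num_layers, tag2idx, drop_prob)
model.load_parameters(model_saved_path, ctx=CTX)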
def logger_0(string):
    LOG_FILE.write(string + "\n")

def logger_1(string):
    LOG_FILE.write(string + "\n")
    print(string)

FLAGS = parser.parse_args()
assert FLAGS.mode and FLAGS.save_folder, "Must specify mode and save folder"
if not os.path.exists(FLAGS.save_folder):
    os.makedirs(FLAGS.save_folder)
LOG_FILE = open(FLAGS.save_folder + "/log", "a")

if __name__ == '__main__':
    if FLAGS.mode == "train":
        logger_1("Start Training ...")
        train(FLAGS.save_folder, FLAGS.restore_from, [logger_0, logger_1])
    elif FLAGS.mode == 'eval':
        raise NotImplementedError
    else:  # mode == test
        raise NotImplementedError
    LOG_FILE.close()
def main():
    # content_path = '../data/paper_path_content10.txt'
    # category_path = '../data/paper_category10.txt'
    content_path = '../data/papertext.txt'
    category_path = '../data/papercategory.txt'
    titles, contents, labels = get_data1(content_path, category_path)
    from collections import Counter
    print(Counter(labels))

    # Method 2: use an external pre-trained word-vector vocabulary
    max_words = 10000
    customer_embedding_path = '../data/word_embedding/sgns.baidubaike.bigram-char'  # pre-trained word vectors
    my_vocab = get_vocab(contents, customer_embedding_path, max_words)  # get_vocab() returns the vocabulary
    pad_num_value = my_vocab.to_indices(PAD)

    # Convert the input text to integer indices
    input_idx = sentences2idx(contents, my_vocab)  # sentences2idx() maps sentences to index lists

    # Prepare the training and validation data iterators
    max_seq_len = 10
    contents = pad_sequences(input_idx, max_seq_len, pad_num_value)  # pad short sentences with PAD, truncate long ones

    # Build the dataset
    dataset = gluon.data.SimpleDataset(
        [[content, label] for content, label in zip(contents, labels)])
    train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, 0.1)  # train:valid = 9:1
    train_dataset_lengths = [len(data[0]) for data in train_dataset]
    print(len(train_dataset), len(valid_dataset))
    print(len(train_dataset_lengths))

    # Bucketing and DataLoaders
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
                                          nlp.data.batchify.Stack())
    batch_sampler = nlp.data.sampler.FixedBucketSampler(train_dataset_lengths,
                                                        batch_size=32,
                                                        num_buckets=10,
                                                        ratio=0.5,
                                                        shuffle=True)
    train_dataloader = gluon.data.DataLoader(train_dataset,
                                             batch_sampler=batch_sampler,
                                             batchify_fn=batchify_fn)
    valid_dataloader = gluon.data.DataLoader(valid_dataset,
                                             batch_size=32,
                                             shuffle=False,
                                             batchify_fn=batchify_fn)

    # Set model hyperparameters and build the model
    vocab_size = len(my_vocab)    # vocabulary size
    word_vec_size = 300           # word-vector dimension
    nhidden_units = 128           # hidden units per layer
    nlayers = 2                   # number of hidden layers
    drop_prob = 0.3               # dropout probability
    nclass = 3                    # 3 classes, so 3 output units
    model = MyBiLSTM(vocab_size, word_vec_size, nhidden_units, nlayers, drop_prob, nclass)
    model.initialize(init=init.Xavier(), ctx=CTX)
    model.hybridize()

    # Attach the pre-trained word vectors to the embedding layer
    model.embedding_layer.weight.set_data(my_vocab.embedding.idx_to_vec)
    # Freeze the embedding layer
    model.embedding_layer.collect_params().setattr('grad_req', 'null')

    # Define the loss function and optimizer
    nepochs, lr = 10, 0.001
    loss = WeightedSoftmaxCE()
    class_weight = nd.array([1, 1, 1], ctx=CTX)
    trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})

    # Train
    th.train(train_dataloader, valid_dataloader, model, loss, class_weight,
             trainer, CTX, nepochs, clip=5.0)

    # Save the model
    model_path = '../models/bi_lstm/bi_lstm_model'
    model.export(model_path)
    print('Training done; model saved to:', model_path)
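# A minimal reload sketch (an assumption, not part of the original script):
# model.export(model_path) writes '<prefix>-symbol.json' and
# '<prefix>-0000.params', which SymbolBlock.imports can load for inference.
net = gluon.nn.SymbolBlock.imports(model_path + '-symbol.json', ['data'],
                                   model_path + '-0000.params', ctx=CTX)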
                       action='store', type=int, default=1024)
my_parser.add_argument('--epochs', action='store', type=int, default=1)
my_parser.add_argument('--gpu', action='store_true')

# Execute the parse_args() method
args = my_parser.parse_args()
input_path = args.dataDirectory
save_dir = args.save_dir
arch = args.arch
hidden_units = args.hidden_units
epochs = args.epochs
gpu = args.gpu
learning_rate = args.learning_rate
print(args)

if not os.path.isdir(input_path):
    print('The path specified does not exist')
    sys.exit()

model, train_data, optimizer = train_helper.train(input_path, arch, hidden_units,
                                                  epochs, gpu, learning_rate)
if model == error.UNSUPPORTED_ARCH_ERROR:
    print("[ERROR] Unsupported arch is entered")
else:
    train_helper.save_model(model, train_data, optimizer, save_dir)
parser = argparse.ArgumentParser(description="Network settings for training")
parser.add_argument('data_dir', type=str)
parser.add_argument('--save_dir', type=str, default='./checkpoint.pth')
parser.add_argument('--arch', type=str, action="store", default="vgg16")
# type must be float, not int: int("0.001") would raise for any value passed on the CLI
parser.add_argument('--learning_rate', type=float, action="store", default=0.001)
parser.add_argument('--hidden_units', type=int, action="store", default=512)
parser.add_argument('--epochs', type=int, action="store", default=1)
parser.add_argument('--gpu', action="store_true", default=False)

# Parse the settings for data loading
args = parser.parse_args()

# Process the data
trainloader, validloader, testloader, class_to_idx = train_helper.process_data(args.data_dir)

# Create the model
model = train_helper.create_model(arch=args.arch, hidden_units=args.hidden_units)

# Train the model
model = train_helper.train(model, trainloader, validloader,
                           lr=args.learning_rate, epochs=args.epochs, gpu=args.gpu)

# Save the model
train_helper.save_model(model, class_to_idx, args.arch, save_loc=args.save_dir)
save_dir = args.save_dir if args.save_dir else 'checkpoint.pth'

model.classifier = train_helper.Network(input_size=num_input,
                                        output_size=num_output,
                                        hidden_layers=hidden_layers)
print("This is the model:..........................................................................")
print(model)

# Define hyperparameters
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)

# Train the network
train_helper.train(model, trainloader, testloader, criterion, optimizer, epochs, gpu)

# Validate with the model built directly, i.e. not loaded from file
train_helper.validation_pass(model, validationloader, criterion)

# Done: save the checkpoint
checkpoint = {
    'model_arch': arch,
    'input_size': model.classifier.hidden_layers[0].in_features,
    'output_size': len(class_to_idx),
    'hidden_layers': [each.out_features for each in model.classifier.hidden_layers],
    'state_dict': model.state_dict(),
    'class_to_idx': class_to_idx
}
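# The excerpt builds the checkpoint dict but never writes it; a minimal save
# sketch, assuming torch is imported and save_dir is the intended target path:
torch.save(checkpoint, save_dir)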
# Get data
batch_size = 64
d = get_IMDB(batch_size=batch_size, device=device, flag_use_pretrained=True)

# ======================================
# Sentence Representation using CBOW
clf_cbow = Classifier(sr_model=CBOW, output_dim=2,
                      vocab_size=d.vocab_size, embed_dim=d.embed_dim)
clf_cbow.to(device)
clf_cbow.sr_model.embeddings.weight.data.copy_(d.embeddings)
train(model=clf_cbow, train_iter=d.train_iter, test_iter=d.test_iter,
      n_epoch=5, lr=0.001)

# ======================================
# Sentence Representation using RN
clf_rn = Classifier(sr_model=RN, output_dim=2,
                    vocab_size=d.vocab_size, embed_dim=d.embed_dim, max_len=100)
clf_rn.to(device)
clf_rn.sr_model.embeddings.weight.data.copy_(d.embeddings)
# The original excerpt breaks off mid-call; completed to mirror the CBOW
# training call above (same hyperparameters assumed):
train(model=clf_rn, train_iter=d.train_iter, test_iter=d.test_iter,
      n_epoch=5, lr=0.001)
import sys

print("Init..")
model_name = "model.ckpt"
epochs = 2
if "e" in sys.argv:
    epochs = int(sys.argv[sys.argv.index("e") + 1])

files = []
files.append("mute")
files.append("volume")
files.append("channel")
print("Files: " + ", ".join(files))

print("Loading data..")
inputs, outputs, words = load.load_data(files)

if "t" in sys.argv:
    print("Setup train..")
    sess = tf.InteractiveSession()
    x, y, y_ = th.setup(len(words), len(files))
    train_step, writer, merged, accuracy = th.trainSetup(y, y_, sess)
    print("Train..")
    th.train(inputs, outputs, x, y_, train_step, sess, epochs, writer, merged, accuracy)
    print("Save..")
    th.save(sess, model_name)
else:
    print("Test..")
    test.test(model_name, words, files)
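# Example invocations implied by the sys.argv checks above (script name assumed):
#   python main.py t e 5   -> train for 5 epochs, then save model.ckpt
#   python main.py         -> skip training and run the test path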
dev_dataset = MyDataset(dev_file, dictionary, args.max_length,
                        tokenizer=tokenizer, word=args.word)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True,
                            drop_last=True, collate_fn=collate_fn)
test_dataset = MyDataset(test_file, dictionary, args.max_length,
                         tokenizer=tokenizer, word=args.word)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True,
                             drop_last=True, collate_fn=collate_fn)

# train
# conf.n_vocab = dictionary.max_vocab_size
model = x.Model().to(device)
if model_name != 'Transformer':
    init_network(model)
print(model.parameters)
train(model, train_dataloader, dev_dataloader, test_dataloader)