Example no. 1
def main():
    parser = argparse.ArgumentParser()
    # Model parameters
    parser.add_argument("--max_sequence_length", default=140, help="Bert input max sequence length", type=int)

    # Path parameters
    parser.add_argument("--train_dataset_path", default='{}/dataset/src_data/train_dataset/nCoV_100k_train.labled.csv'.format(BASE_DIR), help="Train folder")
    parser.add_argument("--test_dataset_path", default='{}/dataset/src_data/test_dataset/nCov_10k_test.csv'.format(BASE_DIR), help="Test folder")
    parser.add_argument("--test_submit_example_path", default='{}/data/test_dataset/submit_example.csv'.format(BASE_DIR), help="submit_example folder")
    parser.add_argument("--bert_pretrain_path", default='{}/dataset/bert_base_chinese/'.format(BASE_DIR), help="Bert Pretrain folder")

    # others
    parser.add_argument("--input_categories", default="微博中文内容", help="输入文本的文本内容列")
    parser.add_argument("--output_categories", default="情感倾向", help="标签列")
    parser.add_argument("--epochs", default=2, help="train epochs", type=int)
    parser.add_argument("--batch_size", default=8, help="train batch_size", type=int)

    # Cross-validation parameters
    parser.add_argument("--n_splits", default=5, help="train n_splits", type=int)
    parser.add_argument("--use_cross_valid", default=True, help="whether to use cross-validation")
    parser.add_argument("--cross_dataset_path", default='{}/dataset/cross_data/'.format(BASE_DIR),
                        help="Cross valid folder")

    # Dataset split path parameter
    parser.add_argument("--split_dataset_path", default='{}/dataset/split_data/'.format(BASE_DIR), help="Split dataset folder")

    # mode
    parser.add_argument("--mode", default='test', help="training or test options")
    parser.add_argument("--loss_type", default="focal_loss", help="loss type is focal_loss or cross_entropy")
    parser.add_argument("--learning_rate_1", default=1e-5, help="learning_rate_1")
    parser.add_argument("--learning_rate_2", default=1e-4, help="learning_rate_2 is None or 1e-4...")
    parser.add_argument("--use_different_learning_rate", default=True, help="是否使用不同的学习率")


    # checkpoint
    parser.add_argument("--model_checkpoint_dir", default='{}/ckpt'.format(BASE_DIR), help="Model folder")

    args = parser.parse_args()
    params = vars(args)

    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')

    if gpus:
        tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')

    if params["mode"] == "train":
        train(params)
    elif params["mode"] == "test":
        test(params)
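
The snippet above pins TensorFlow to the first visible GPU before dispatching to train() or test(). A common optional refinement, sketched below under the assumption of TensorFlow 2.x (it is not part of the original example), is to also enable memory growth so the process does not reserve all GPU memory up front.

import tensorflow as tf

# Sketch only: restrict TensorFlow to the first GPU and allocate memory on demand.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
if gpus:
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)
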
Example no. 2
def main_train(data_path):

    data_text, data_tag = load_tag_file_raw_data(data_path)
    data_set, word_vocab = get_tag_file_dataset(data_text, data_tag)

    with open(VOCAB_PATH, 'wb') as fw:
        pickle.dump(word_vocab, fw)

    NWORDS = len(word_vocab)

    encoder = Encoder(NWORDS, EMBED_SIZE, WORD_HIDDEN_SIZE, WORD_NLAYERS,
                      SENTENCE_HIDDEN_SIZE, SENTENCE_NLAYERS, NDOC_DIMS)
    sent_rnn = SentenceRecurrent(SENTENCE_HIDDEN_SIZE)

    train(encoder, sent_rnn, data_set, LR, BATCH_SIZE, N_EPOCHS, word_vocab,
          CTX)
Example no. 3
    # Set mxnet random number seed
    mx.random.seed(args.seed)

    # use GPU if available, otherwise fall back to CPU
    ctx = try_gpu()

    # get train and valid dataloader
    train_dataloader, valid_dataloader, vocab = build_dataloader(args)

    # build model
    model = build_model(vocab, args)

    # build loss, trainer and class_weight
    loss, trainer, class_weight = build_loss_optimizer(model, args, ctx)

    # train
    nepochs = args.nepochs
    penalization_coeff = args.penalization_coeff
    clip = args.clip
    loss_name = args.loss_name
    model_root = args.model_root
    model_name = args.model_name
    log_interval = args.log_interval
    lr_decay_step = args.lr_decay_step
    lr_decay_rate = args.lr_decay_rate
    th.train(train_dataloader, valid_dataloader, model, loss, trainer, ctx,
             nepochs, penalization_coeff, clip, class_weight, loss_name,
             model_name, model_root, log_interval, lr_decay_step,
             lr_decay_rate)
Example no. 4
        dictionary = Dictionary()
        dictionary.build_dictionary(data)
        del data
        joblib.dump(dictionary, config.root_path + '/model/vocab.bin')
    else:
        dictionary = joblib.load(args.dictionary)
    if not args.model.isupper():
        tokenizer = config.tokenizer
    else:
        tokenizer = None

    logger.info('Making dataset & dataloader...')
    ### TODO
    # 1. Build the DataLoaders from the custom MyDataset.
    # The assignments below are a sketch only: train_file, dev_file, test_file
    # and batch_size are assumed to be defined, mirroring the loaders shown later.
    train_dataset = MyDataset(train_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
    dev_dataset = MyDataset(dev_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
    test_dataset = MyDataset(test_file, dictionary, args.max_length, tokenizer=tokenizer, word=args.word)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)

    # train

    #     conf.n_vocab = dictionary.max_vocab_size
    model = x.Model(config).to(config.device)
    if model_name != 'Transformer':
        init_network(model)
    print(model.parameters)

    train(config, model, train_dataloader, dev_dataloader, test_dataloader)
Example no. 5
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    ecs = []
    dcs = []
    eca = 0
    dca = 0

    while epoch < n_epochs:
        epoch += 1
        input_batches, input_lengths, target_batches, target_lengths = dh.random_batch(
            batch_size, pairs, input_lang, target_lang)
        loss, ec, dc = th.train(input_batches, input_lengths, target_batches,
                                target_lengths, encoder, decoder,
                                encoder_optimizer, decoder_optimizer,
                                train_conf)
        print_loss_total += loss
        plot_loss_total += loss
        eca += ec
        dca += dc

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (
                sh.time_since(start, float(epoch) / n_epochs),
                epoch, epoch / n_epochs * 100, print_loss_avg)
            print(print_summary)
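
plot_losses and plot_loss_total are initialized above, but the excerpt ends before they are used; in the usual form of this loop they accumulate an averaged loss for plotting. Below is a minimal sketch of that bookkeeping, placed inside the same while loop; plot_every is an assumed hyperparameter that the excerpt does not define.

        # Sketch (assumes a plot_every interval, analogous to print_every).
        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
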
Example no. 6
def main(model_saved_path, model_name):
    ne_cate_dic = Configuer.ne_cate_dic
    word_path = Configuer.word_path
    label_path = Configuer.label_path
    nature_path = Configuer.nature_path

    X_path = Configuer.X_path
    y_path = Configuer.y_path
    nature_py_path = Configuer.nature_py_path
    word_vocab_path = Configuer.word_vocab_path
    label_vocab_path = Configuer.label_vocab_path
    nature_vocab_path = Configuer.nature_vocab_path

    max_seq_len = Configuer.MAX_SEQ_LEN
    pad = Configuer.PAD
    pad_nature = Configuer.PAD_NATURE
    unk = Configuer.UNK
    not_ne = Configuer.NOT

    # Load cached data from disk if it exists
    if os.path.exists(word_vocab_path) and os.path.exists(label_vocab_path)\
            and os.path.exists(nature_vocab_path) and os.path.exists(X_path)\
            and os.path.exists(y_path) and os.path.exists(nature_py_path):
        print('Loading existed data...')
        with open(word_vocab_path, 'rb') as f1, \
                open(label_vocab_path, 'rb') as f2, \
                open(nature_vocab_path, 'rb') as f3:
            word_vocab = pickle.load(f1)
            label_vocab = pickle.load(f2)
            nature_vocab = pickle.load(f3)
        data_x = np.load(X_path)
        data_y = np.load(y_path)
        data_nature = np.load(nature_py_path)
        print('Loading end!')
    else:
        # Convert the raw text data to numpy arrays and pickle files
        print('Converting data from scratch...')
        word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
            word_path, label_path, nature_path, max_seq_len, pad, not_ne,
            pad_nature, unk)
        data_x, data_y, data_nature = convert_txt_data(
            X_path, y_path, nature_py_path, input_seqs, output_seqs,
            nature_seqs, word_vocab, label_vocab, nature_vocab, max_seq_len,
            unk)
        with open(word_vocab_path, 'wb') as fw1, \
                open(label_vocab_path, 'wb') as fw2, \
                open(nature_vocab_path, 'wb') as fw3:
            pickle.dump(word_vocab, fw1)
            pickle.dump(label_vocab, fw2)
            pickle.dump(nature_vocab, fw3)
        np.save(X_path, data_x)
        np.save(y_path, data_y)
        np.save(nature_py_path, data_nature)
        print('Converting end!')

    # Split into training and validation sets
    X_train, X_valid, Y_train, Y_valid, nature_train, nature_valid = train_test_split(
        data_x, data_y, data_nature, test_size=0.1, random_state=33)
    print(X_train.shape, X_valid.shape)
    # X_train = X_train[0:512]
    # nature_train = nature_train[0:512]
    # Y_train = Y_train[0:512]
    # X_valid = X_valid[0:512]
    # nature_valid = nature_valid[0:512]
    # Y_valid = Y_valid[0:512]
    dataset_train = ArrayDataset(nd.array(X_train, ctx=CTX),
                                 nd.array(nature_train, ctx=CTX),
                                 nd.array(Y_train, ctx=CTX))
    data_iter_train = DataLoader(dataset_train,
                                 batch_size=256,
                                 shuffle=True,
                                 last_batch='rollover')
    dataset_valid = ArrayDataset(nd.array(X_valid, ctx=CTX),
                                 nd.array(nature_valid, ctx=CTX),
                                 nd.array(Y_valid, ctx=CTX))
    data_iter_valid = DataLoader(dataset_valid, batch_size=256, shuffle=False)

    # Configure the model according to the arguments
    model, loss = None, None
    word_vocab_size, word_vec_size = len(word_vocab), 300
    nature_vocab_size, nature_vec_size = len(nature_vocab), 50
    drop_prob = 0.3
    num_epochs = 20
    lr = 0.0001

    if model_name == 'lstm_crf':
        print('train lstm_crf model')
        hidden_dim = 128
        num_layers = 2
        tag2idx = label_vocab.token_to_idx
        model = LSTM_CRF(word_vocab_size, word_vec_size, nature_vocab_size,
                         nature_vec_size, hidden_dim, num_layers, tag2idx,
                         drop_prob)
        model.initialize(init=init.Xavier(), ctx=CTX)
        loss = model.crf.neg_log_likelihood
    elif model_name == 'cnn_crf':
        pass
    elif model_name == 'cnn':
        pass

    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})

    # Start training
    print('waiting...')
    print(model)
    th.train(data_iter_train, data_iter_valid, model, loss, trainer, CTX,
             num_epochs, word_vocab, label_vocab, max_seq_len, ne_cate_dic)

    # Save the model parameters
    model.save_parameters(model_saved_path)
    print(model_name + ' model parameters saved to:',
          os.path.abspath(model_saved_path))
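
For completeness, the parameters saved above can be restored with Gluon's load_parameters. The sketch below is an assumption rather than part of the example: it presumes the same lstm_crf configuration (constructor arguments) used during training.

# Sketch: rebuild the network with the same arguments and reload the weights.
model = LSTM_CRF(word_vocab_size, word_vec_size, nature_vocab_size,
                 nature_vec_size, hidden_dim, num_layers, tag2idx, drop_prob)
model.load_parameters(model_saved_path, ctx=CTX)
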
Example no. 7
def logger_0(string):
    LOG_FILE.write(string + "\n")


def logger_1(string):
    LOG_FILE.write(string + "\n")
    print(string)


FLAGS = parser.parse_args()
assert FLAGS.mode and FLAGS.save_folder, "Must specify mode and save folder"

if not os.path.exists(FLAGS.save_folder):
    os.makedirs(FLAGS.save_folder)
LOG_FILE = open(FLAGS.save_folder + "/log", "a")

if __name__ == '__main__':

    if FLAGS.mode == "train":
        logger_1("Start Training ...")
        train(FLAGS.save_folder, FLAGS.restore_from, [logger_0, logger_1])

    elif FLAGS.mode == 'eval':
        raise NotImplementedError

    else:  # mode=test
        raise NotImplementedError

    LOG_FILE.close()
Example no. 8
def main():
    #content_path = '../data/paper_path_content10.txt'
    #category_path = '../data/paper_category10.txt'
    content_path = '../data/papertext.txt'
    category_path = '../data/papercategory.txt'

    titles, contents, labels = get_data1(content_path, category_path)
    from collections import Counter
    print(Counter(labels))

    # Method 2: use an existing pre-trained vocabulary / embedding
    max_words = 10000
    customer_embedding_path = '../data/word_embedding/sgns.baidubaike.bigram-char'  # pre-trained word embeddings
    my_vocab = get_vocab(contents, customer_embedding_path,
                         max_words)  # get_vocab() builds the vocabulary
    pad_num_value = my_vocab.to_indices(PAD)

    # Convert the input text to integer indices
    input_idx = sentences2idx(contents,
                              my_vocab)  # sentences2idx() maps each sentence to a list of indices
    # Prepare the training and validation data iterators
    max_seq_len = 10
    contents = pad_sequences(input_idx, max_seq_len,
                             pad_num_value)  # pad or truncate each sentence to max_seq_len

    # Build the dataset
    dataset = gluon.data.SimpleDataset(
        [[content, label] for content, label in zip(contents, labels)])
    train_dataset, valid_dataset = nlp.data.train_valid_split(
        dataset, 0.1)  # train:valid split, 9:1
    train_dataset_lengths = [len(data[0]) for data in train_dataset]
    print(len(train_dataset), len(valid_dataset))
    print(len(train_dataset_lengths))

    # Bucketing and DataLoader
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
                                          nlp.data.batchify.Stack())
    batch_sampler = nlp.data.sampler.FixedBucketSampler(train_dataset_lengths,
                                                        batch_size=32,
                                                        num_buckets=10,
                                                        ratio=0.5,
                                                        shuffle=True)
    train_dataloader = gluon.data.DataLoader(train_dataset,
                                             batch_sampler=batch_sampler,
                                             batchify_fn=batchify_fn)
    valid_dataloader = gluon.data.DataLoader(valid_dataset,
                                             batch_size=32,
                                             shuffle=False,
                                             batchify_fn=batchify_fn)

    # Set model hyperparameters and build the model
    vocab_size = len(my_vocab)  # vocabulary size
    word_vec_size = 300  # word-vector dimension
    nhidden_units = 128  # hidden units per layer
    nlayers = 2  # number of hidden layers
    drop_prob = 0.3  # dropout probability
    nclass = 3  # three classes, so three output nodes

    model = MyBiLSTM(vocab_size, word_vec_size, nhidden_units, nlayers,
                     drop_prob, nclass)
    model.initialize(init=init.Xavier(), ctx=CTX)
    model.hybridize()
    # Attach the pre-trained word vectors to the embedding layer
    model.embedding_layer.weight.set_data(my_vocab.embedding.idx_to_vec)
    # Freeze the embedding layer
    model.embedding_layer.collect_params().setattr('grad_req', 'null')

    # Define the loss function and optimizer
    nepochs, lr = 10, 0.001
    loss = WeightedSoftmaxCE()
    class_weight = nd.array([1, 1, 1], ctx=CTX)

    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})

    # Train
    th.train(train_dataloader,
             valid_dataloader,
             model,
             loss,
             class_weight,
             trainer,
             CTX,
             nepochs,
             clip=5.0)

    # Save the model
    model_path = '../models/bi_lstm/bi_lstm_model'
    model.export(model_path)
    print('Training finished, model saved to:', model_path)
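
model.export(model_path) writes a '<prefix>-symbol.json' and '<prefix>-0000.params' pair. Below is a minimal reload sketch (not in the original example) using gluon.nn.SymbolBlock.imports; the input name 'data' is the usual Gluon default and is an assumption here.

from mxnet import gluon

# Sketch: reload the exported symbol + params for inference.
net = gluon.nn.SymbolBlock.imports(model_path + '-symbol.json', ['data'],
                                   model_path + '-0000.params', ctx=CTX)
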
Example no. 9
                       action='store',
                       type=int,
                       default=1024)
my_parser.add_argument('--epochs', action='store', type=int, default=1)
my_parser.add_argument('--gpu', action='store_true')

# Execute the parse_args() method
args = my_parser.parse_args()

input_path = args.dataDirectory
save_dir = args.save_dir
arch = args.arch
hidden_units = args.hidden_units
epochs = args.epochs
gpu = args.gpu
learning_rate = args.learning_rate
print(args)

if not os.path.isdir(input_path):
    print('The path specified does not exist')
    sys.exit()

model, train_data, optimizer = train_helper.train(input_path, arch,
                                                  hidden_units, epochs, gpu,
                                                  learning_rate)

if model == error.UNSUPPORTED_ARCH_ERROR:
    print("[ERROR] Unsupported arch is entered")
else:
    train_helper.save_model(model, train_data, optimizer, save_dir)
Example no. 10
parser = argparse.ArgumentParser(description="Network settings for training")
parser.add_argument('data_dir', type=str)
parser.add_argument('--save_dir', type=str, default='./checkpoint.pth')
parser.add_argument('--arch', type=str, action="store", default="vgg16")
parser.add_argument('--learning_rate', type=float, action="store", default=0.001)
parser.add_argument('--hidden_units', type=int, action="store", default=512)
parser.add_argument('--epochs', type=int, action="store", default=1)
parser.add_argument('--gpu', action="store_true", default=False)

# Parse the command-line arguments
args = parser.parse_args()

# Process the data
trainloader, validloader, testloader, class_to_idx = train_helper.process_data(
    args.data_dir)

# Create the model
model = train_helper.create_model(arch=args.arch,
                                  hidden_units=args.hidden_units)

# Train the model
model = train_helper.train(model,
                           trainloader,
                           validloader,
                           lr=args.learning_rate,
                           epochs=args.epochs,
                           gpu=args.gpu)

# Save the model
train_helper.save_model(model, class_to_idx, args.arch, save_loc=args.save_dir)
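
For reference, the parser defined above can also be exercised programmatically by handing parse_args an explicit argument list, which is convenient for quick checks; the 'flowers' directory below is a placeholder, not a path from the example.

# Sketch: programmatic invocation with a placeholder data directory.
args = parser.parse_args(['flowers', '--arch', 'vgg16', '--epochs', '3', '--gpu'])
print(args.arch, args.learning_rate, args.gpu)
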
Example no. 11
save_dir = args.save_dir if args.save_dir else 'checkpoint.pth'

model.classifier = train_helper.Network(input_size=num_input,
                                        output_size=num_output,
                                        hidden_layers=hidden_layers)
print(
    "This is the model:.........................................................................."
)
print(model)

# Define hyper parameters
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)

# Train the network
train_helper.train(model, trainloader, testloader, criterion, optimizer,
                   epochs, gpu)

# Validation with model made directly i.e not loaded from file
train_helper.validation_pass(model, validationloader, criterion)

# Done: Save the checkpoint
checkpoint = {
    'model_arch': arch,
    'input_size': model.classifier.hidden_layers[0].in_features,
    'output_size': len(class_to_idx),
    'hidden_layers':
    [each.out_features for each in model.classifier.hidden_layers],
    'state_dict': model.state_dict(),
    'class_to_idx': class_to_idx
}
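
The excerpt assembles the checkpoint dictionary but stops before writing it to disk. A minimal sketch of the usual final step, assuming the standard torch.save serializer and the save_dir path defined above:

import torch

# Sketch: persist the checkpoint so it can be reloaded with torch.load later.
torch.save(checkpoint, save_dir)
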
Example no. 12
# Get data
batch_size = 64
d = get_IMDB(batch_size=batch_size, device=device, flag_use_pretrained=True)

# ======================================
# Sentence Representation using CBOW
clf_cbow = Classifier(sr_model=CBOW,
                      output_dim=2,
                      vocab_size=d.vocab_size,
                      embed_dim=d.embed_dim)
clf_cbow.to(device)
clf_cbow.sr_model.embeddings.weight.data.copy_(d.embeddings)

train(model=clf_cbow,
      train_iter=d.train_iter,
      test_iter=d.test_iter,
      n_epoch=5,
      lr=0.001)

# ======================================
# Sentence Representation using RN
clf_rn = Classifier(sr_model=RN,
                    output_dim=2,
                    vocab_size=d.vocab_size,
                    embed_dim=d.embed_dim,
                    max_len=100)
clf_rn.to(device)
clf_rn.sr_model.embeddings.weight.data.copy_(d.embeddings)

train(model=clf_rn,
      train_iter=d.train_iter,
      # remaining arguments assumed to mirror the CBOW call above
      test_iter=d.test_iter,
      n_epoch=5,
      lr=0.001)
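
Both classifiers copy the pre-trained vectors into their embedding tables. If those vectors are meant to stay fixed during training, a common optional follow-up (not shown in the example) is to freeze them:

# Sketch: freeze the copied pre-trained embeddings so the optimizer skips them.
clf_cbow.sr_model.embeddings.weight.requires_grad = False
clf_rn.sr_model.embeddings.weight.requires_grad = False
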
Example no. 13
import sys

print("Init..")
model_name = "model.ckpt"
epochs = 2
if "e" in sys.argv:
    epochs = int(sys.argv[sys.argv.index("e")+1])
files = []
files.append("mute")
files.append("volume")
files.append("channel")
print("Files: " + ", ".join(files))

print("Loading data..")
inputs, outputs, words = load.load_data(files)

if "t" in sys.argv:
    print("Setup train..")
    sess = tf.InteractiveSession()
    x, y, y_ = th.setup(len(words), len(files))
    train_step, writer, merged, accuracy = th.trainSetup(y, y_, sess)

    print("Train..")
    th.train(inputs, outputs, x, y_, train_step, sess, epochs, writer, merged, accuracy)

    print("Save..")
    th.save(sess, model_name)
else:
    print("Test..")
    test.test(model_name, words, files)
    dev_dataset = MyDataset(dev_file,
                            dictionary,
                            args.max_length,
                            tokenizer=tokenizer,
                            word=args.word)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                drop_last=True,
                                collate_fn=collate_fn)
    test_dataset = MyDataset(test_file,
                             dictionary,
                             args.max_length,
                             tokenizer=tokenizer,
                             word=args.word)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 collate_fn=collate_fn)

    # train

    #     conf.n_vocab = dictionary.max_vocab_size
    model = x.Model().to(device)
    if model_name != 'Transformer':
        init_network(model)
    print(model.parameters)

    train(model, train_dataloader, dev_dataloader, test_dataloader)