def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(test_data,
                             batch_size=16,
                             shuffle=False,
                             num_workers=0,
                             collate_fn=mycollate_fn)

    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")  # uncomment to force CPU
    criterion = nn.CrossEntropyLoss()
    model.load_state_dict(torch.load(Config.model_save_path), strict=True)  # load the trained model

    confuse_meter = test(test_loader, device, model, criterion)
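# NOTE: mycollate_fn is used above but not defined in this snippet. Below is a
# minimal sketch of what such a collate function might look like, assuming each
# CommentDataSet item is a (index-tensor, label) pair -- an assumption, not the
# original implementation:
import torch
from torch.nn.utils.rnn import pad_sequence

def mycollate_fn(batch):
    # Pad variable-length index sequences to the longest in the batch
    # and return the padded batch, labels, and original lengths.
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels), lengths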
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a word dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build word dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Build a char dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build char dictionary')
    char_dict = utils.build_char_dict(args, train_exs + dev_exs)
    logger.info('Num chars = %d' % len(char_dict))

    # Initialize model
    model = DocReader(config.get_model_args(args), word_dict, char_dict,
                      feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)
    if args.char_embedding_file:
        model.load_char_embeddings(char_dict.tokens(), args.char_embedding_file)

    return model
def init_from_scratch(args, train_exs):
    print('init from scratch')
    print('building word vocabulary')
    word_dict = build_word_dict(args, train_exs)
    print('building char vocabulary')
    char_dict = build_char_dict(args, train_exs)
    model = TMmodel(args, word_dict, char_dict)
    model.load_word_embedding()
    model.load_char_embedding()
    return model
def __init__(self):
    # we treat x, y, z as OOV
    self.vocab_dict = build_word_dict(src_path, tgt_path)
    self.vocab_size = len(self.vocab_dict)
    # self.all_tokens['x'] = len(self.all_tokens)
    # self.all_tokens['y'] = len(self.all_tokens)
    # self.all_tokens['z'] = len(self.all_tokens)
    self.reverse_vocab_dict = dict(
        zip(self.vocab_dict.values(), self.vocab_dict.keys()))
    self.dataset = ToyDataset()
    self.tested_examples = ['cake i love']
def prepare_dataloader(word_dict=None, feature_dict=None):
    """Create data loaders for train and dev."""
    # Load examples
    logger.info('-' * 100)
    logger.info('Loading Datasets...')
    toyfile = 'toy-' if conf['debug'] else ''
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-train.txt'.format(toyfile, conf['year']))
    train_ex = utils.load_data(datafile)
    logger.info('{} train examples loaded'.format(len(train_ex)))
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-test.txt'.format(toyfile, conf['year']))
    test_ex = utils.load_data(datafile)
    logger.info('{} test examples loaded'.format(len(test_ex)))

    # Prepare feature_dict and word_dict
    if feature_dict is None:
        if len(conf['features']) > 0:
            logger.info('Building feature dictionary...')
            feature_dict = utils.build_feature_dict(train_ex)
            if conf['idf-file'] is not None and 'idf' not in feature_dict:
                feature_dict['idf'] = len(feature_dict)
            logger.info('Num features = {}'.format(len(feature_dict)))
            logger.info(feature_dict)
    if word_dict is None:
        logger.info('Build word dictionary...')
        word_dict = utils.build_word_dict(train_ex + test_ex)
        logger.info('Num words = %d' % len(word_dict))
    conf['vocab-size'] = len(word_dict)

    # Prepare DataLoaders
    logger.info('-' * 100)
    logger.info('Creating DataLoaders')
    train_dataset = utils.QaProxDataset(conf, train_ex, word_dict,
                                        feature_dict, conf['idf-file'])
    train_loader_ = DataLoader(train_dataset,
                               batch_size=conf['batch-size'],
                               sampler=sampler.RandomSampler(train_dataset),
                               collate_fn=utils.batchify,
                               num_workers=conf['num-workers'],
                               pin_memory=conf['cuda'])
    dev_dataset = utils.QaProxDataset(conf, test_ex, word_dict,
                                      feature_dict, conf['idf-file'])
    dev_loader_ = DataLoader(dev_dataset,
                             batch_size=conf['batch-size'],
                             sampler=sampler.RandomSampler(dev_dataset),
                             collate_fn=utils.batchify,
                             num_workers=conf['num-workers'],
                             pin_memory=conf['cuda'])
    return train_loader_, dev_loader_, word_dict, feature_dict
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    model.load_state_dict(torch.load(Config.model_save_path), strict=True)  # load the trained model

    # Example input, a Chinese movie review (translation: "The line I can't
    # forget is DuPont smiling at the male lead and saying 'Sue me'. I remember
    # an incident a while back with the same rhetoric, 'feel free to sue
    # Zhonghua Youwei'. The same arrogance. Will we see a film adaptation years
    # from now?"):
    # comment_str = "忘不掉的一句台词,是杜邦公司笑着对男主说:“Sue me”。我记得前段时间某件事,也是同样的说辞,“欢迎来起诉中华有为”。也是同样的跋扈。若干年后,会看到改编的电影吗。"
    result = predict(Config.comment_str, model, device, word2ix)
    print(Config.comment_str, result)
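# NOTE: predict() is called above but not shown. Below is a minimal sketch
# under two assumptions the snippet does not confirm: tokenization is
# character-level (plausible for Chinese input), and the model maps a
# (1, seq_len) index tensor directly to class logits.
import torch

def predict(comment_str, model, device, word2ix):
    model.to(device)
    model.eval()
    unk = word2ix.get('<UNK>', 0)                        # assumed OOV entry
    idxs = [word2ix.get(ch, unk) for ch in comment_str]  # char-level lookup
    inputs = torch.tensor([idxs], dtype=torch.long, device=device)
    with torch.no_grad():
        logits = model(inputs)  # assumed model signature
    return logits.argmax(dim=-1).item()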
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + words (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Initialize model
    model = ParagraphRanker(config.get_model_args(args), word_dict,
                            feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file and not args.no_embed:
        model.load_embeddings(word_dict.tokens(), args.embedding_file,
                              args.fasttext)

    return model
with open(args.train_file, 'r') as f:
    train_exs = json.load(f)
    # train_exs = train_exs[:100]
with open(args.dev_file, 'r') as f:
    dev_exs = json.load(f)
    # dev_exs = dev_exs[:100]
with open(args.test_file, 'r') as f:
    test_exs = json.load(f)
    # test_exs = test_exs[:100]

# Build the dictionaries. feature_dict maps feature names to indices, e.g.
# feature_dict['in_question'] = 0, ['in_question_uncased'] = 1,
# ['in_question_lemma'] = 2, ['pos=NN'] = 3, ['pos=IN'] = 4, ['pos=DT'] = 5, ...
feature_dict = build_feature_dict(args, train_exs)
word_dict = build_word_dict(args, train_exs, dev_exs + test_exs)
logger.info('Num words = %d' % len(word_dict))

# --------------------------------------------------------------------------
logger.info('-' * 100)
logger.info('Make data loaders')

# each example is vectorized individually
train_dataset = ReaderDataset(train_exs, args, word_dict, feature_dict,
                              if_train=True)
# sampling strategy
if args.sort_by_len:
    train_sampler = SortedBatchSampler(train_dataset.lengths(), args.batch_size,
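# NOTE: the excerpt above ends mid-call to SortedBatchSampler, which is not
# defined here. Below is a minimal sketch of such a length-sorted sampler,
# assuming lengths is a sequence of per-example integers; the real class may
# differ:
import numpy as np
from torch.utils.data.sampler import Sampler

class SortedBatchSampler(Sampler):
    """Group examples of similar length into the same batch to reduce
    padding, then shuffle the order of the batches."""

    def __init__(self, lengths, batch_size, shuffle=True):
        self.lengths = lengths
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        order = np.argsort(self.lengths)
        batches = [order[i:i + self.batch_size]
                   for i in range(0, len(order), self.batch_size)]
        if self.shuffle:
            np.random.shuffle(batches)
        return iter(int(i) for batch in batches for i in batch)

    def __len__(self):
        return len(self.lengths)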
help="RNN network depth.") parser.add_argument("--num_hidden", type=int, default=50, help="RNN network size.") parser.add_argument("--keep_prob", type=float, default=0.5, help="dropout keep prob.") parser.add_argument("--learning_rate", type=float, default=1e-3, help="learning rate.") parser.add_argument("--batch_size", type=int, default=20, help="batch size.") parser.add_argument("--num_epochs", type=int, default=30, help="number of epochs.") args = parser.parse_args() train_file = "ptb_data/ptb.train.txt" test_file = "ptb_data/ptb.test.txt" word_dict = build_word_dict(train_file) train_data = build_dataset(train_file, word_dict) test_data = build_dataset(test_file, word_dict) train(train_data, test_data, len(word_dict), args)
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    train_data = CommentDataSet(Config.train_path, word2ix, ix2word)
    train_loader = DataLoader(train_data,
                              batch_size=16,
                              shuffle=True,
                              num_workers=0,
                              collate_fn=mycollate_fn)
    validation_data = CommentDataSet(Config.validation_path, word2ix, ix2word)
    validation_loader = DataLoader(validation_data,
                                   batch_size=16,
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=mycollate_fn)
    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(test_data,
                             batch_size=16,
                             shuffle=False,
                             num_workers=0,
                             collate_fn=mycollate_fn)

    weight = pre_weight(len(word2ix), Config.pred_word2vec_path,
                        Config.embedding_dim, word2ix, ix2word)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
                                                gamma=0.1)  # learning-rate schedule
    criterion = nn.CrossEntropyLoss()

    # TensorBoard writes many log files; clear out any old logs first
    if os.path.exists(Config.tensorboard_path):
        shutil.rmtree(Config.tensorboard_path)
    os.mkdir(Config.tensorboard_path)

    for epoch in range(Config.epochs):
        # wrap a fresh progress bar each epoch instead of re-wrapping
        # train_loader itself
        epoch_loader = tqdm(train_loader)
        epoch_loader.set_description('[%s%04d/%04d %s%f]' % (
            'Epoch:', epoch + 1, Config.epochs, 'lr:', scheduler.get_lr()[0]))
        train(epoch, Config.epochs, epoch_loader, device, model, criterion,
              optimizer, scheduler, Config.tensorboard_path)
        validate(epoch, validation_loader, device, model, criterion,
                 Config.tensorboard_path)

    # save the model (create the save directory if it does not exist yet)
    if not os.path.exists('./modelDict/'):
        os.mkdir('./modelDict/')
    torch.save(model.state_dict(), Config.model_save_path)

    confuse_meter = test(test_loader, device, model, criterion)
def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
          model_save_dir):
    """
    :params train_data_dir: The directory of training data. If this parameter
        is not specified, the imdb dataset is used to run this example.
    :type train_data_dir: str
    :params test_data_dir: The directory of testing data. If this parameter
        is not specified, the imdb dataset is used to run this example.
    :type test_data_dir: str
    :params word_dict_path: The path of the word dictionary. If this parameter
        is not specified, the imdb dataset is used to run this example.
    :type word_dict_path: str
    :params label_dict_path: The path of the label dictionary. If this
        parameter is not specified, the imdb dataset is used to run this
        example.
    :type label_dict_path: str
    :params model_save_dir: The directory where models are saved.
    :type model_save_dir: str
    """
    if train_data_dir is not None:
        assert word_dict_path and label_dict_path, (
            "The parameters train_data_dir, word_dict_path and label_dict_path "
            "should be set at the same time.")

    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    use_default_data = (train_data_dir is None)

    if use_default_data:
        logger.info(("No training data are provided, "
                     "use imdb to train the model."))
        logger.info("Please wait to build the word dictionary ...")
        word_dict = reader.imdb_word_dict()
        train_reader = paddle.batch(paddle.reader.shuffle(
            lambda: reader.imdb_train(word_dict), buf_size=1000),
                                    batch_size=100)
        test_reader = paddle.batch(lambda: reader.imdb_test(word_dict),
                                   batch_size=100)
        class_num = 2
    else:
        if word_dict_path is None or not os.path.exists(word_dict_path):
            logger.info(("Word dictionary is not given; the dictionary "
                         "is automatically built from the training data."))
            # build the word dictionary to map the original string-typed
            # words into integer-typed indices
            build_word_dict(data_dir=train_data_dir,
                            save_path=word_dict_path,
                            use_col=1,
                            cutoff_fre=0)
        if not os.path.exists(label_dict_path):
            logger.info(("Label dictionary is not given; the dictionary "
                         "is automatically built from the training data."))
            # build the label dictionary to map the original string-typed
            # labels into integer-typed indices
            build_label_dict(data_dir=train_data_dir,
                             save_path=label_dict_path,
                             use_col=0)

        word_dict = load_dict(word_dict_path)
        label_dict = load_dict(label_dict_path)
        class_num = len(label_dict)
        logger.info("Class number is : %d." % class_num)

        train_reader = paddle.batch(paddle.reader.shuffle(
            reader.train_reader(train_data_dir, word_dict, label_dict),
            buf_size=conf.buf_size),
                                    batch_size=conf.batch_size)

        if test_data_dir is not None:
            # here, because training and testing data share the same format,
            # we still use reader.train_reader to read the testing data.
            test_reader = paddle.batch(paddle.reader.shuffle(
                reader.train_reader(test_data_dir, word_dict, label_dict),
                buf_size=conf.buf_size),
                                       batch_size=conf.batch_size)
        else:
            test_reader = None

    dict_dim = len(word_dict)
    logger.info("Length of word dictionary is : %d." % dict_dim)

    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)

    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=conf.learning_rate,
        regularization=paddle.optimizer.L2Regularization(
            rate=conf.l2_learning_rate),
        model_average=paddle.optimizer.ModelAverage(
            average_window=conf.average_window))

    # define network topology.
    cost, prob, label = nested_net(dict_dim, class_num, is_infer=False)

    # create all the trainable parameters.
    parameters = paddle.parameters.create(cost)

    # create the trainer instance.
    trainer = paddle.trainer.SGD(cost=cost,
                                 extra_layers=paddle.evaluator.auc(
                                     input=prob, label=label),
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    # feeding dictionary
    feeding = {"word": 0, "label": 1}

    def _event_handler(event):
        """
        Define the end-batch and end-pass event handlers.
        """
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % conf.log_period == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("Test at Pass %d, %s \n" % (event.pass_id,
                                                        result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir,
                                 "params_pass_%05d.tar.gz" % event.pass_id),
                    "w") as f:
                trainer.save_parameter_to_tar(f)

    # begin training the network
    trainer.train(reader=train_reader,
                  event_handler=_event_handler,
                  feeding=feeding,
                  num_passes=conf.num_passes)

    logger.info("Training has finished.")