import os

import paddle
import paddle.fluid as fluid
import six


def train(save_vocab_path='', train_path='', test_path='', train_seg_path='',
          test_seg_path='', model_save_dir='', vocab_max_size=5000,
          vocab_min_count=5, hidden_dim=512, batch_size=64, use_cuda=False):
    # Build the training program and its startup program.
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Segment the raw data and build (or load) the vocabulary.
    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)
    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)

    # Convert the segmented text into id sequences.
    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    num_encoder_tokens = len(train_set_ids)
    max_input_texts_len = max([len(text) for text in train_texts])
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    # save_word_dict(vocab2id, save_vocab_path)

    # Shuffle and batch the training data, then wire up the feeder.
    train_reader = data_generator(train_set_ids)
    train_data = paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=10000),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
        place=place,
        program=train_prog)

    exe.run(startup_prog)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog,
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost))
            batch_id += 1
        # Save parameters after every pass.
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
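# A minimal usage sketch, not taken from the original source: the paths below
# are hypothetical placeholders, and the helper functions used inside train()
# (train_model, optimizer_func, seg_data, build_dataset, build_vocab, ...) are
# assumed to be importable from the surrounding project.
if __name__ == '__main__':
    train(save_vocab_path='output/vocab.txt',
          train_path='data/train.csv',
          test_path='data/test.csv',
          train_seg_path='data/train_seg.csv',
          test_seg_path='data/test_seg.csv',
          model_save_dir='output/models',
          vocab_min_count=5,
          hidden_dim=512,
          batch_size=64,
          use_cuda=False)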
import pickle

import torch


def main():
    # Load the trained model and move it to GPU if requested.
    with open(args.model, 'rb') as f:
        model = torch.load(f)
    if args.cuda:
        model.cuda()

    # Load the word-to-id vocabulary.
    with open(args.word_path, 'rb') as f:
        word2id = pickle.load(f)

    # Segment the raw test data and convert it to id sequences.
    raw_data = seg_data(args.data)
    transformed_data = transform_data_to_id(raw_data, word2id)
    data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
    # Sort by length so that similarly sized samples end up in the same batch.
    data = sorted(data, key=lambda x: len(x[1]))
    print('test data size {:d}'.format(len(data)))
    inference(model, data)
parser.add_argument('--batch_size', type=int, default=32, metavar='N',
                    help='batch size')
# CUDA is enabled by default; pass --cuda to set it explicitly.
parser.add_argument('--cuda', action='store_true', default=True,
                    help='use CUDA')
args = parser.parse_args()

# Load the trained model and move it to GPU if requested.
with open(args.model, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()

# Load the word-to-id vocabulary (cPickle is the Python 2 pickle module).
with open(args.word_path, 'rb') as f:
    word2id = cPickle.load(f)

# Segment the test data, convert it to id sequences, and sort by length.
raw_data = seg_data(args.data)
transformed_data = transform_data_to_id(raw_data, word2id)
data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
data = sorted(data, key=lambda x: len(x[1]))
print('test data size {:d}'.format(len(data)))


def inference():
    model.eval()
    predictions = []
    with torch.no_grad():
        # Iterate over the test set in mini-batches of args.batch_size.
        for i in range(0, len(data), args.batch_size):
            try:
                one = data[i:i + args.batch_size]
                # print(one)
def __init__(self, input_file_path, seg_input_file_path='', word_vocab_path='',
             label_vocab_path='', feature_vec_path='', model_save_path='',
             pred_save_path='', feature_type='tf_word', model_type='logistic',
             num_classes=2, col_sep='\t', min_count=1, lower_thres=0.5,
             upper_thres=0.85, label_ratio=0.9, label_min_size=200,
             batch_size=10, warmstart_size=0.02,
             stop_words_path='data/stop_words.txt'):
    self.input_file_path = input_file_path
    self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
    self.stop_words_path = stop_words_path
    self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
    self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
    self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
    self.model_save_path = model_save_path if model_save_path else "model.pkl"
    self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
    self.feature_type = feature_type
    self.num_classes = num_classes
    self.col_sep = col_sep
    self.min_count = min_count
    self.lower_thres = lower_thres
    self.upper_thres = upper_thres
    self.label_ratio = label_ratio

    # 1. load segment data
    if not os.path.exists(self.seg_input_file_path):
        start_time = time()
        seg_data(self.input_file_path, self.seg_input_file_path,
                 col_sep=self.col_sep, stop_words_path=self.stop_words_path)
        logger.info("spend time: %s s" % (time() - start_time))
    self.seg_contents, self.data_lbl = data_reader(self.seg_input_file_path, self.col_sep)

    # 2. load original data
    self.content, _ = data_reader(self.input_file_path, self.col_sep)

    # 3. load feature
    word_lst = []
    for i in self.seg_contents:
        word_lst.extend(i.split())
    # word vocab
    self.word_vocab = build_vocab(word_lst, min_count=self.min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(self.word_vocab, self.word_vocab_path)
    # label
    label_vocab = build_vocab(self.data_lbl)
    # save label vocab
    write_vocab(label_vocab, self.label_vocab_path)
    label_id = load_vocab(self.label_vocab_path)
    print("label_id: %s" % label_id)
    self.set_label_id(label_id)
    self.id_label = {v: k for k, v in label_id.items()}
    print('num_classes:%d' % self.num_classes)
    self.data_feature = self._get_feature(self.word_vocab)

    # 4. assemble sample DataObject
    self.samples = self._get_samples(self.data_feature)
    self.batch_num = batch_size if batch_size > 1 else batch_size * len(self.samples)
    self.warmstart_num = warmstart_size if warmstart_size > 1 else warmstart_size * len(self.samples)
    self.label_min_num = label_min_size if label_min_size > 1 else label_min_size * len(self.samples)

    # 5. init model
    self.model = get_model(model_type)
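# The batch_num, warmstart_num and label_min_num attributes above interpret
# their arguments either as an absolute count (value > 1) or as a fraction of
# the sample set (value <= 1). A small standalone sketch of that rule, using a
# hypothetical helper name:
def resolve_size(value, num_samples):
    """Return an absolute size from either a count or a fraction of num_samples."""
    return value if value > 1 else value * num_samples

# e.g. with 1000 samples: batch_size=10 -> 10, warmstart_size=0.02 -> 20.0,
# label_min_size=200 -> 200.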
parser.add_argument('--cuda', action='store_true', default=True,
                    help='use CUDA')
args = parser.parse_args()

# Load the trained model and move it to GPU if requested.
with open(args.model, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()
print(model)

# Load the word-to-id vocabulary.
with open(args.word_path, 'rb') as f:
    word2id = pickle.load(f)
print(len(word2id))

# Segment the test data, convert it to id sequences, and sort by length.
raw_data = seg_data(args.data)
transformed_data = transform_data_to_id(raw_data, word2id)
data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
data = sorted(data, key=lambda x: len(x[1]))
print('test data size {:d}'.format(len(data)))

# Do the same for the validation data.
raw_data_valid = seg_data(args.valid_data)
transformed_data_valid = transform_data_to_id(raw_data_valid, word2id)
dev_data = [x + [y[2]] for x, y in zip(transformed_data_valid, raw_data_valid)]
dev_data = sorted(dev_data, key=lambda x: len(x[1]))
print('valid data size {:d}'.format(len(dev_data)))


def inference():
    model.eval()
    predictions = []
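# Why sort by len(x[1]) before batching: samples of similar length land in the
# same mini-batch, so each batch only needs padding up to its own maximum. A
# minimal illustration with made-up token-id sequences (not project data):
samples = [[1, [7, 8, 9, 10]], [2, [5]], [3, [4, 6]]]
samples = sorted(samples, key=lambda x: len(x[1]))
for i in range(0, len(samples), 2):          # batch_size = 2
    batch = samples[i:i + 2]
    max_len = max(len(s[1]) for s in batch)  # pad only up to the batch maximum
    padded = [s[1] + [0] * (max_len - len(s[1])) for s in batch]
    print(padded)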