import os

import paddle
import paddle.fluid as fluid
import six


def train(save_vocab_path='',
          train_path='',
          test_path='',
          train_seg_path='',
          test_seg_path='',
          model_save_dir='',
          vocab_max_size=5000,
          vocab_min_count=5,
          hidden_dim=512,
          batch_size=64,  # batch size for training; referenced below
          use_cuda=False):

    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)

    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)

    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    # the number of unique input tokens is the vocabulary size,
    # not the number of sequences
    num_encoder_tokens = len(vocab)
    max_input_texts_len = max(len(text) for text in train_texts)
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)

    train_reader = data_generator(train_set_ids)

    train_data = paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=10000),
        batch_size=batch_size)

    feeder = fluid.DataFeeder(feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
                              place=place,
                              program=train_prog)

    exe.run(startup_prog)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost[0]))
            batch_id += 1
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
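
A minimal call sketch for the function above; every path here is a placeholder, not a file from the original project:

if __name__ == '__main__':
    train(save_vocab_path='output/vocab.txt',
          train_path='data/train.csv',
          test_path='data/test.csv',
          train_seg_path='data/train_seg.csv',
          test_seg_path='data/test_seg.csv',
          model_save_dir='output/model',
          use_cuda=False)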
Example #2
import pickle

import torch


def main():
    with open(args.model, 'rb') as f:
        model = torch.load(f)
    if args.cuda:
        model.cuda()

    with open(args.word_path, 'rb') as f:
        word2id = pickle.load(f)

    raw_data = seg_data(args.data)
    transformed_data = transform_data_to_id(raw_data, word2id)
    # keep the third field of each raw record alongside its id-mapped sample
    data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
    # sort by the length of the second field so batches hold similar-length sequences
    data = sorted(data, key=lambda x: len(x[1]))
    print('test data size {:d}'.format(len(data)))
    inference(model, data)
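
main() reads a module-level args object that this excerpt never defines. Below is a minimal sketch of the missing argument parsing, with flag names inferred from the attributes used above (args.model, args.word_path, args.data, args.cuda); it is an assumption, not the original code:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, help='path to the serialized model')
parser.add_argument('--word_path', required=True, help='path to the pickled word2id dict')
parser.add_argument('--data', required=True, help='path to the test data')
parser.add_argument('--cuda', action='store_true', help='use CUDA')
args = parser.parse_args()

if __name__ == '__main__':
    main()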
Example #3
parser.add_argument('--batch_size', type=int, default=32, metavar='N',
                    help='batch size')
# note: with default=True the flag is always on; use default=False
# if --cuda is meant to be opt-in
parser.add_argument('--cuda', action='store_true', default=True,
                    help='use CUDA')

args = parser.parse_args()

with open(args.model, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()

# Python 2/3 compatibility for the cPickle import used below
try:
    import cPickle
except ImportError:
    import pickle as cPickle

with open(args.word_path, 'rb') as f:
    word2id = cPickle.load(f)

raw_data = seg_data(args.data)
transformed_data = transform_data_to_id(raw_data, word2id)
data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
data = sorted(data, key=lambda x: len(x[1]))
print('test data size {:d}'.format(len(data)))


def inference():
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(0, len(data), args.batch_size):
            one = data[i:i + args.batch_size]
            # ... (the excerpt ends here)
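
The loop body is cut off above. Purely as a sketch of how such a batched inference loop is typically finished: the batch_to_tensors helper and the model's call signature below are assumptions, not the original API.

def inference_sketch():
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(0, len(data), args.batch_size):
            one = data[i:i + args.batch_size]
            query, passage = batch_to_tensors(one)  # hypothetical helper
            scores = model(query, passage)          # assumed call signature
            predictions.extend(scores.max(dim=1)[1].tolist())
    return predictions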
Example #4
    def __init__(self,
                 input_file_path,
                 seg_input_file_path='',
                 word_vocab_path='',
                 label_vocab_path='',
                 feature_vec_path='',
                 model_save_path='',
                 pred_save_path='',
                 feature_type='tf_word',
                 model_type='logistic',
                 num_classes=2,
                 col_sep='\t',
                 min_count=1,
                 lower_thres=0.5,
                 upper_thres=0.85,
                 label_ratio=0.9,
                 label_min_size=200,
                 batch_size=10,
                 warmstart_size=0.02,
                 stop_words_path='data/stop_words.txt'):
        self.input_file_path = input_file_path
        self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
        self.stop_words_path = stop_words_path
        self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
        self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
        self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
        self.model_save_path = model_save_path if model_save_path else "model.pkl"
        self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
        self.feature_type = feature_type
        self.num_classes = num_classes
        self.col_sep = col_sep
        self.min_count = min_count
        self.lower_thres = lower_thres
        self.upper_thres = upper_thres
        self.label_ratio = label_ratio

        # 1. load segment data
        if not os.path.exists(self.seg_input_file_path):
            start_time = time()
            seg_data(self.input_file_path,
                     self.seg_input_file_path,
                     col_sep=self.col_sep,
                     stop_words_path=self.stop_words_path)
            logger.info("spend time: %s s" % (time() - start_time))
        self.seg_contents, self.data_lbl = data_reader(
            self.seg_input_file_path, self.col_sep)

        # 2. load original data
        self.content, _ = data_reader(self.input_file_path, self.col_sep)

        # 3. load feature
        word_lst = []
        for i in self.seg_contents:
            word_lst.extend(i.split())
        # word vocab
        self.word_vocab = build_vocab(word_lst,
                                      min_count=self.min_count,
                                      sort=True,
                                      lower=True)
        # save word vocab
        write_vocab(self.word_vocab, self.word_vocab_path)
        # label
        label_vocab = build_vocab(self.data_lbl)
        # save label vocab
        write_vocab(label_vocab, self.label_vocab_path)
        label_id = load_vocab(self.label_vocab_path)
        print("label_id: %s" % label_id)
        self.set_label_id(label_id)
        self.id_label = {v: k for k, v in label_id.items()}
        print('num_classes:%d' % self.num_classes)
        self.data_feature = self._get_feature(self.word_vocab)

        # 4. assemble sample DataObject
        self.samples = self._get_samples(self.data_feature)
        # values <= 1 are interpreted as a fraction of the sample count
        self.batch_num = batch_size if batch_size > 1 else int(batch_size * len(self.samples))
        self.warmstart_num = warmstart_size if warmstart_size > 1 else int(warmstart_size * len(self.samples))
        self.label_min_num = label_min_size if label_min_size > 1 else int(label_min_size * len(self.samples))

        # 5. init model
        self.model = get_model(model_type)
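
A minimal instantiation sketch for this constructor; the class statement is not shown in the excerpt, so TextClassifier is a placeholder name, and the input path is made up:

clf = TextClassifier('data/train.txt',  # placeholder class name and path
                     feature_type='tf_word',
                     model_type='logistic',
                     num_classes=2,
                     batch_size=10)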
Example #5
parser.add_argument('--cuda', action='store_true', default=True,
                    help='use CUDA')

args = parser.parse_args()

with open(args.model, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()
    print(model)

with open(args.word_path, 'rb') as f:
    word2id = pickle.load(f)
    print(len(word2id))

raw_data = seg_data(args.data)
transformed_data = transform_data_to_id(raw_data, word2id)
data = [x + [y[2]] for x, y in zip(transformed_data, raw_data)]
data = sorted(data, key=lambda x: len(x[1]))
print('test data size {:d}'.format(len(data)))

raw_data_valid = seg_data(args.valid_data)
transformed_data_valid = transform_data_to_id(raw_data_valid, word2id)
dev_data = [x + [y[2]] for x, y in zip(transformed_data_valid, raw_data_valid)]
dev_data = sorted(dev_data, key=lambda x: len(x[1]))
print('valid data size {:d}'.format(len(dev_data)))


def inference():
    model.eval()
    predictions = []