Example #1
def main(args):
    problem_fns = problems.parse_problems(args.problem, registry_problem_fns)
    for n, (k, [fn, on_list]) in enumerate(problem_fns.items(), start=1):
        if on_list:  # run only the problems requested on the command line
            utils.verbose('Start processing no.{} problem [{}]'.format(n, k))
            fn.process(args)
            utils.verbose('Finish processing no.{} problem [{}]'.format(n, k))
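Neither parse_problems nor the registry appears in the excerpt; a hypothetical sketch of the contract main() relies on, where each entry maps a problem name to a [handler, enabled] pair:

def parse_problems(problem_arg, registry):
    # hypothetical reconstruction, not the project's actual implementation
    requested = set(problem_arg.split(','))
    return {name: [fn, name in requested] for name, fn in registry.items()}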
Example #2
def _build_vocab(self, data, vocab_size):
    # collect_vocab returns tokens ranked by frequency (see Example #7)
    self.words_count = self.collect_vocab(data)
    self.vocab = copy_head + list(self.words_count)
    if len(self.vocab) > vocab_size:
        self.vocab = self.vocab[:vocab_size]  # truncate to the requested size
    utils.verbose('real vocab: {}, final vocab: {}'.format(
        len(self.words_count), self.vocab_size))
    self.build_vocab_dict()
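The excerpt references two names defined elsewhere; a minimal sketch, assuming copy_head is a list of reserved tokens prepended to the vocab and vocab_size a property derived from it:

copy_head = ['<pad>', '<unk>', '<s>', '</s>']  # hypothetical reserved tokens

class TokenizerSketch:
    def __init__(self):
        self.vocab = list(copy_head)

    @property
    def vocab_size(self):
        # assumed to track the current length of the vocab list
        return len(self.vocab)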
Example #3
def split_dialogues(dialogues, train_dev_ratio=10):
    random.shuffle(dialogues)
    divider = len(dialogues) // train_dev_ratio  # one part in ten goes to dev
    dev_dialogues = dialogues[:divider]
    train_dialogues = dialogues[divider:]
    utils.verbose('train set #: {}'.format(len(dialogues) - divider))
    utils.verbose('dev set #: {}'.format(divider))
    return train_dialogues, dev_dialogues
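A self-contained toy run of the same split arithmetic (stand-in data, seeded for repeatability):

import random

random.seed(0)
dialogues = list(range(20))          # stand-ins for real dialogues
random.shuffle(dialogues)
divider = len(dialogues) // 10       # a tenth of the data goes to dev
dev, train = dialogues[:divider], dialogues[divider:]
assert (len(dev), len(train)) == (2, 18)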
Example #4
def merge_hparam(args):
    if args.hparam_set not in registry_hparams:
        raise ValueError(
            'invalid hyperparameter set {}'.format(args.hparam_set))
    hparam = registry_hparams[args.hparam_set]
    for k, v in hparam.__dict__.items():
        if not k.startswith('_'):  # copy only public attributes onto args
            utils.verbose('add attribute {} [{}] to hparams'.format(k, v))
            setattr(args, k, v)
    return args
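The attribute-copy idiom in isolation, with a hypothetical hparam set (registry_hparams itself is not shown in the excerpt):

from argparse import Namespace

class BaseHparam:                  # hypothetical hyperparameter set
    batch_size = 32
    learning_rate = 1e-3

args = Namespace(hparam_set='base')
for k, v in BaseHparam.__dict__.items():
    if not k.startswith('_'):      # skip dunders, copy public attributes
        setattr(args, k, v)
assert args.batch_size == 32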
Example #5
def __init__(self, vocab_file=None, segment='jieba'):
    self.words_count = dict()
    if vocab_file is not None:
        self.vocab = utils.read_lines(vocab_file)
        utils.verbose(
            'loading vocab from file {} with vocab_size {}'.format(
                vocab_file, self.vocab_size))
    else:
        self.vocab = []  # vocab will be built later from data
    self.sub_cutter = SubCutter(chinese_seg=segment)
    self.vocab_dict = dict()
    self.build_vocab_dict()
    self.PAD_ID = 0
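build_vocab_dict is not shown in the excerpt; presumably it inverts the vocab list into a token-to-id lookup, something like:

def build_vocab_dict(self):
    # hypothetical reconstruction: token -> integer id, in vocab order
    self.vocab_dict = {token: idx for idx, token in enumerate(self.vocab)}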
Example #6
def main(args):
    problem_fns = problems.parse_problems(args.problem, registry_problem_fns)
    model = problems.parse_model(args.model, registry_models)
    for n, (k, [fn, on_list]) in enumerate(problem_fns.items(), start=1):
        if on_list:
            utils.verbose('Start processing no.{} problem [{}]'.format(n, k))
            if args.gpu_device != '':
                os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_device
            config = tf.ConfigProto()
            # gpu_memory is a fraction in (0, 1] of per-process GPU memory
            config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory
            with tf.Session(config=config) as sess:
                fn.process(args, model(args), sess)
            utils.verbose('Finish processing no.{} problem [{}]'.format(n, k))
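The snippet targets the TF1 graph/session API; under TensorFlow 2 the same session configuration is reachable through the compat layer, e.g.:

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5  # cap GPU memory use
with tf.compat.v1.Session(config=config) as sess:
    pass  # graph-mode work goes here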
Example #7
def collect_vocab(self, lines):
    words_count = dict()
    for n, line in enumerate(lines, start=1):
        if not n % 10:  # progress log every 10 lines
            utils.verbose('processing line no.{}'.format(n))
        tokens = self.sub_cutter.cut(line)
        for token in tokens:
            if token in words_count:
                words_count[token] += 1
            elif token.startswith('{{') and token.endswith('}}'):
                # collapse template slots such as '{{name:value}}' into a
                # single placeholder token '<name>'
                new_token = '<' + token.split(':')[0][2:] + '>'
                if new_token in words_count:
                    words_count[new_token] += 1
                else:
                    words_count[new_token] = 1
            else:
                words_count[token] = 1
    # sorting the dict by value yields its keys, i.e. the tokens ranked by
    # descending frequency, not the count mapping itself
    return sorted(words_count, key=words_count.get, reverse=True)
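Note the final sorted() call: iterating a dict yields its keys, so collect_vocab returns a frequency-ranked token list rather than the count mapping. A quick check:

counts = {'the': 3, 'a': 5, 'cat': 1}
ranked = sorted(counts, key=counts.get, reverse=True)
assert ranked == ['a', 'the', 'cat']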
Example #8
def build_qa(dialogues, directory, prefix='train', mode='qaqaq'):
    q_path = os.path.join(directory, prefix + '_q.txt')
    a_path = os.path.join(directory, prefix + '_a.txt')
    counter = 0
    with open(q_path, 'w', encoding='utf-8') as fq, \
            open(a_path, 'w', encoding='utf-8') as fa:
        for dial in dialogues:
            content, sent_by = zip(*dial)
            full = ''.join(sent_by)  # speaker sequence, e.g. 'qaqaqa...'
            # zero-width lookahead finds every overlapping turn window that
            # is followed by an answer turn
            for i in re.finditer(r'(?={})'.format(mode + 'a'), full):
                question = '<s>'.join(
                    content[i.start():i.start() + len(mode)]) + '<s>'
                answer = content[i.start() + len(mode)]
                fq.write(question + '\n')
                fa.write(answer + '\n')
                counter += 1
                if counter % 10000 == 0:
                    utils.verbose('store {} lines for {} set'.format(
                        counter, prefix))
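The zero-width lookahead is what makes the turn windows overlap; a plain pattern would miss the overlapping windows. A minimal demonstration with mode='qaqaq':

import re

full = 'qaqaqaqa'  # toy speaker sequence: alternating q/a turns
starts = [m.start() for m in re.finditer(r'(?=qaqaqa)', full)]
assert starts == [0, 2]  # both overlapping windows are found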
Example #9
def process(hparam, model, sess):
    utils.clean_and_make_directory(hparam.model_dir)
    tokenizer = Tokenizer(hparam.vocab_file, segment=hparam.segment)
    train_batch = BaseBatch(
        tokenizer, init_helper(hparam, hparam.train_q, hparam.train_a))
    dev_batch = BaseBatch(tokenizer,
                          init_helper(hparam, hparam.dev_q, hparam.dev_a))

    sess.run(tf.global_variables_initializer())
    starter = time.time()
    saver = tf.train.Saver(pad_step_number=True)
    # running statistics; 'lowest_loss' starts at a high sentinel value
    features = {'lowest_loss': 10, 'train_id': 0, 'dev_id': 0}
    features = reset_features(features)

    for i in range(hparam.max_steps):
        train_batch_features = train_batch.next_batch(hparam.batch_size,
                                                      features['train_id'])
        train_fetches, train_feed_dict = model.train_step(train_batch_features)
        features['train_id'] = train_batch_features['idx']
        _, train_loss, train_acc = sess.run(train_fetches,
                                            feed_dict=train_feed_dict)
        features['train_losses'].append(train_loss)
        features['train_acc'].append(train_acc)
        if i % hparam.show_steps == 0 and i:
            dev_fetches, dev_feed_dict = model.dev_step(
                dev_batch.next_batch(hparam.batch_size, features['dev_id']))
            features['dev_id'] += hparam.batch_size
            dev_loss, dev_acc = sess.run(dev_fetches, feed_dict=dev_feed_dict)
            features['dev_losses'].append(dev_loss)
            features['dev_acc'].append(dev_acc)
            speed = hparam.show_steps / (time.time() - starter)
            utils.verbose(r'        step {:05d} | train [{:.5f} {:.5f}] | '
                          r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                              i, train_loss, train_acc, dev_loss, dev_acc,
                              speed))
            starter = time.time()

        if i % hparam.save_steps == 0 and i:
            features = avg_features(features)  # collapse lists to means
            if features['dev_losses'] < features['lowest_loss']:
                saver.save(sess, hparam.model_path)  # keep the best checkpoint
                features['lowest_loss'] = features['dev_losses']
            utils.verbose(r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                          r'dev [{:.5f} {:.5f}]'.format(
                              i - hparam.save_steps, i,
                              features['train_losses'], features['train_acc'],
                              features['dev_losses'], features['dev_acc']))
            print('-+' * 55)
            features = reset_features(features)

        if train_batch_features['update_epoch']:
            train_batch.shuffle_data()
            if train_batch.epoch > 10:
                utils.verbose('update epoch and reorder data...')
                train_batch = reorder_batch(hparam, model, sess, train_batch)

    utils.write_result(hparam, features['lowest_loss'])
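reset_features and avg_features are not shown in the excerpt; plausible reconstructions, consistent with how `features` is used above (lists accumulated between save points, then collapsed into means):

def reset_features(features):
    # hypothetical: clear the per-interval accumulators
    features.update(train_losses=[], train_acc=[], dev_losses=[], dev_acc=[])
    return features

def avg_features(features):
    # hypothetical: collapse each accumulator into its mean
    for key in ('train_losses', 'train_acc', 'dev_losses', 'dev_acc'):
        values = features[key]
        features[key] = sum(values) / max(len(values), 1)
    return features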
Example #10
def build_vocab(self, data, vocab_size, path):
    self._build_vocab(data, vocab_size)
    utils.write_lines(path, self.vocab)
    utils.verbose('vocab has been dumped to {}'.format(
        os.path.abspath(path)))
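utils.write_lines is defined elsewhere; a minimal sketch of the assumed helper:

def write_lines(path, lines):
    # hypothetical reconstruction: one vocab entry per line, UTF-8
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')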