Example #1
    def _collect_vocab(self, lines):

        def insert(counter, obj):
            if obj in counter:
                counter[obj] += 1
            else:
                counter[obj] = 1
            return counter

        word_counter = dict()
        char_counter = dict()
        for n, line in enumerate(lines, start=1):
            if not n % 10000:
                utils.verbose('processed {} lines'.format(n))
            words = self.cutter.cut(line)
            for word in words:
                if word.startswith('{{') and word.endswith('}}'):
                    new_word = '<' + word.split(':')[0][2:] + '>'
                    word_counter = insert(word_counter, new_word)
                    char_counter = insert(char_counter, new_word)
                else:
                    word_counter = insert(word_counter, word)
                    for char in word:
                        char_counter = insert(char_counter, char)
        word_counter = sorted(word_counter, key=word_counter.get, reverse=True)
        char_counter = sorted(char_counter, key=char_counter.get, reverse=True)
        return word_counter, char_counter
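For intuition, a minimal sketch of how the template branch above collapses slot placeholders (the '{{name:Alice}}' literal is an invented illustration, not taken from the project's data):

word = '{{name:Alice}}'
if word.startswith('{{') and word.endswith('}}'):
    new_word = '<' + word.split(':')[0][2:] + '>'  # counted as the single token '<name>'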
Example #2
def _reconstruct_args(args):
    hp_mode, hparams = args.hparams.split('_')
    args.tokenizer = Tokenizer

    if hparams == 'lstm':
        original = hparams_utils.lstm()
    elif hparams == 'gru':
        original = hparams_utils.gru()
    elif hparams == 'lstmln':
        original = hparams_utils.lstm_ln()
    elif hparams == 'lstmrcnn':
        original = hparams_utils.lstm_rcnn()
    else:
        raise ValueError('Unknown hparams: {}'.format(hparams))

    if hp_mode == 'solo':
        args.batch = data.SoloBatch
        args.model = model.SoloModel
        args.max_lens = [original.x_max_len, original.y_max_len]
    elif hp_mode == 'penta':
        args.batch = data.PentaBatch
        args.model = model.PentaModel
        args.max_lens = [original.y_max_len, original.y_max_len]
    else:
        raise ValueError('Unknown hp_mode: {}'.format(hp_mode))

    for k, v in original.__dict__.items():
        if not k.startswith('_'):
            utils.verbose('add attribute {} [{}] to hparams'.format(k, v))
            setattr(args, k, v)
    return args
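A short sketch of the naming convention that _reconstruct_args appears to expect for args.hparams (the value 'solo_lstm' is only an illustration):

hp_mode, hparams = 'solo_lstm'.split('_')
# hp_mode == 'solo'  -> data.SoloBatch and model.SoloModel
# hparams == 'lstm'  -> hparams_utils.lstm()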
Example #3
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_x = utils.read_lines(args.path['train_x'])
    train_y = utils.read_lines(args.path['train_y'])
    dataset = train_x + train_y
    keywords = None

    if args.problem == 'lda':
        model = LDAModel(args)
    else:
        trainset = [tokenizer.encode_line_into_words(i) for i in dataset]
        train_keywords(trainset, args.path['model'])
        keywords = load_keywords(args.path['model'])
        model = TFIDFModel(args)

    list_toks = []
    for n, line in enumerate(train_x):
        if not n % 10000 and n:
            utils.verbose('Tokenizing {} lines for {}'.format(n, args.problem))
        if keywords is None:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)])
        else:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)
                              if s in keywords[: args.num_keywords]])
    model.fit(list_toks)
Example #4
def split_dialogues(dialogues, train_dev_ratio=10):
    random.shuffle(dialogues)
    divider = int(len(dialogues) / train_dev_ratio)
    dev_dialogues = dialogues[: divider]
    train_dialogues = dialogues[divider:]
    utils.verbose('train set #: {}'.format(len(dialogues) - divider))
    utils.verbose('dev set #: {}'.format(divider))
    return train_dialogues, dev_dialogues
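A hedged usage sketch; load_dialogues is a hypothetical loader, not part of this project:

dialogues = load_dialogues()  # hypothetical helper returning a list of dialogues
train_dialogues, dev_dialogues = split_dialogues(dialogues, train_dev_ratio=10)
# with 1000 dialogues this keeps 900 for training and 100 for dev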
Example #5
    def _set_vocab(self, data, word_size, char_size):
        self.word_counter, self.char_counter = self._collect_vocab(data)
        self.words = copy_head + list(self.word_counter)[: word_size - len(copy_head)]
        self.chars = copy_head + list(self.char_counter)[: char_size - len(copy_head)]
        utils.verbose('real words: {}, final words: {}'.format(
            len(self.word_counter) + 3, len(self.words)))
        utils.verbose('real chars: {}, final chars: {}'.format(
            len(self.char_counter) + 3, len(self.chars)))
        self._set_dict()
Example #6
def load_keywords(model_dir):
    path = os.path.join(model_dir, 'keywords.txt')
    idf_freq = {}
    utils.verbose('loading keywords from {}'.format(path))
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word, freq = line.strip().split(' ')
            idf_freq[int(word)] = float(freq)
    keywords = sorted(idf_freq, key=idf_freq.get)
    return keywords
Example #7
    def build_vocab(self, data, token_limits, files):
        """ Build words and chars with limited sizes and write into files

        :param data: list of lines
        :param token_limits: word_limit_size, char_limit_size
        :param files: word_file_path, char_file_path
        :return:
        """
        self._set_vocab(data, token_limits[0], token_limits[1])
        utils.write_lines(files[0], self.words)
        utils.verbose(
            'words have been dumped to {}'.format(os.path.abspath(files[0])))
        utils.write_lines(files[1], self.chars)
        utils.verbose(
            'chars have been dumped to {}'.format(os.path.abspath(files[1])))
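A minimal call sketch, assuming an empty Tokenizer and illustrative paths and limits (none of these values come from the project's configuration):

tokenizer = Tokenizer()
lines = utils.read_lines('corpus.txt')
tokenizer.build_vocab(lines,
                      token_limits=[40000, 4000],      # word_limit_size, char_limit_size
                      files=['words.txt', 'chars.txt'])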
Example #8
def train_keywords(data, model_dir):
    path = os.path.join(model_dir, 'keywords.txt')
    vocab_counter = {}
    i = 0
    for line in data:
        for word in line:
            if word in vocab_counter:
                vocab_counter[word] += 1
            else:
                vocab_counter[word] = 1
        if not i % 10000 and i:
            utils.verbose('processing {} lines'.format(i))
        i += 1
    with open(path, 'w', encoding='utf-8') as f:
        for key, value in vocab_counter.items():
            f.write(str(key) + ' ' + str(math.log(i / value, 2)) + '\n')
    utils.verbose('keywords are saved in {}'.format(path))
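The score written for each token is an IDF-style weight, log2(total_lines / token_count), so rare tokens get large scores and ubiquitous ones scores near zero. A worked example with invented counts:

import math
math.log(1000000 / 250, 2)      # rare token:   ~11.97
math.log(1000000 / 900000, 2)   # common token: ~0.15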
Example #9
def build_qa(dialogues, directory, prefix='train', mode='qaqaq'):
    q_path = os.path.join(directory, prefix + '_q.txt')
    a_path = os.path.join(directory, prefix + '_a.txt')
    counter = 0
    with open(q_path, 'w', encoding='utf-8') as fq:
        with open(a_path, 'w', encoding='utf-8') as fa:
            for dial in dialogues:
                content, sent_by = zip(*dial)
                full = ''.join(sent_by)
                for i in re.finditer(r'(?={})'.format(mode + 'a'), full):
                    question = '<s>'.join(content[i.start(): i.start() + len(mode)]) + '<s>'
                    answer = content[i.start() + len(mode)]
                    fq.write(question + '\n')
                    fa.write(answer + '\n')
                    counter += 1
                    if counter % 10000 == 0:
                        utils.verbose('store {} lines for {} set'.format(counter, prefix))
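For clarity, a sketch of how mode='qaqaq' slides over a dialogue; the speaker string and indices below are invented:

sent_by = ('q', 'a', 'q', 'a', 'q', 'a', 'q', 'a')
full = ''.join(sent_by)             # 'qaqaqaqa'
# the lookahead '(?=qaqaqa)' matches at indices 0 and 2, so two pairs are
# written: turns 0-4 joined with '<s>' -> turn 5, and turns 2-6 -> turn 7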
Example #10
    def fit(self, list_toks):
        utils.verbose('Start training tfidf dictionary')
        self.dict = corpora.Dictionary(list_toks)

        utils.verbose('Start building tfidf corpus')
        self.corpus = [self.dict.doc2bow(toks) for toks in list_toks]

        utils.verbose('Start training tfidf model')
        self.model = models.TfidfModel(self.corpus)

        utils.verbose('Start saving tfidf dictionary and model')
        self.model.save(self.paths['model'])
        self.dict.save(self.paths['dict'])

        utils.verbose('Start building tfidf index')
        self.index = similarities.SparseMatrixSimilarity(
            self.model[self.corpus], num_features=len(self.dict.dfs))
        # self.index = similarities.MatrixSimilarity(self.model[self.corpus])
        self.index.save(self.paths['index'])
Example #11
    def __init__(self, files=None):
        """ Char-base adding word Tokenizer

        :param files: [word_file_path, char_file_path]
        """
        self.word_counter = {}
        self.char_counter = {}
        if files is not None:
            self.words = utils.read_lines(files[0])
            self.chars = utils.read_lines(files[1])
            utils.verbose('loading words from file {} with word size {}'.format(
                files[0], self.word_size))
            utils.verbose('loading chars from file {} with char size {}'.format(
                files[1], self.char_size))
        else:
            self.words = []
            self.chars = []
        self.cutter = SubCutter()
        self.word_dict = dict()
        self.char_dict = dict()
        self._set_dict()
        self.PAD_ID = 0
        self.UNK_ID = 1
        self.EOS_ID = 2
Example #12
    def load(self):
        if all([os.path.exists(i) for i in self.paths.values()]):
            self.model = models.LdaMulticore.load(self.paths['model'])
            utils.verbose('load lda model from {}'.format(self.paths['model']))
            self.dict = corpora.Dictionary.load(self.paths['dict'])
            utils.verbose('load lda dictionary from {}'.format(
                self.paths['dict']))
            self.ann = AnnoyIndex(self.vec_dim)
            self.ann.load(self.paths['ann'])
            utils.verbose('load lda annoy from {}'.format(self.paths['ann']))
        else:
            raise ValueError('Files are missing under directory {}'.format(
                self.model_dir))
Example #13
    def load(self):
        if all([os.path.exists(i) for i in self.paths.values()]):
            self.model = models.TfidfModel.load(self.paths['model'])
            utils.verbose('Load tfidf model from {}'.format(
                self.paths['model']))
            self.dict = corpora.Dictionary.load(self.paths['dict'])
            utils.verbose('Load tfidf dictionary from {}'.format(
                self.paths['dict']))
            self.index = similarities.SparseMatrixSimilarity.load(
                self.paths['index'])
            # self.index = similarities.MatrixSimilarity.load(self.paths['index'])
            utils.verbose('Load tfidf index from {}'.format(
                self.paths['index']))
        else:
            raise ValueError('Files are missing under directory {}'.format(
                self.model_dir))
Example #14
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()

        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)

            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r'        step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()

            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
                utils.write_result(args, recorder.lowest_loss)

        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
    vectors = np.reshape(np.array(vectors),
                         [-1, args.hidden])[:infer_batch.data_size]
    vec_dim = vectors.shape[-1]
    ann = AnnoyIndex(vec_dim)
    for n, ii in enumerate(vectors):
        ann.add_item(n, ii)
    ann.build(args.num_trees)
    ann.save(args.path['ann'])
    utils.verbose('Annoy index has been dumped to {}'.format(args.path['ann']))
Example #15
# coding:utf-8
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

from src import utils
from src.utils import args_utils
from src.data_utils import data_generator
from src.data_utils import vocab_generator


if __name__ == '__main__':
    hparams = args_utils.minor_args()
    utils.verbose('Start generating data')
    data_generator.process(hparams)
    utils.verbose('Finish generating data')
    utils.verbose('Start generating vocab')
    vocab_generator.process(hparams)
    utils.verbose('Finish generating vocab')
Example #16
# coding:utf-8
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

from src import utils
from src.utils import args_utils
from src.dual_encoder import trainer_lib as dual_encoder_trainer
from src.traditional import trainer_lib as traditional_trainer

trainer_index = {
    'dual_encoder': dual_encoder_trainer,
    'tfidf': traditional_trainer,
    'lda': traditional_trainer
}

if __name__ == '__main__':
    hparams = args_utils.major_args()
    if hparams.problem is None:
        raise ValueError('A problem must be specified')
    elif hparams.problem not in trainer_index:
        raise ValueError('Invalid problem: {}'.format(hparams.problem))
    else:
        utils.verbose('Start training problem: {}'.format(hparams.problem))
        trainer_index[hparams.problem].process(hparams)
        utils.verbose('Finish training problem: {}'.format(hparams.problem))
Example #17
    def fit(self, list_toks):
        utils.verbose('start training lda dictionary')
        self.dict = corpora.Dictionary(list_toks)

        utils.verbose('start building lda corpus')
        self.corpus = [self.dict.doc2bow(toks) for toks in list_toks]

        utils.verbose('start training lda model')
        self.model = models.LdaMulticore(self.corpus,
                                         self.vec_dim,
                                         id2word=self.dict)

        utils.verbose('start saving lda dictionary and model')
        self.model.save(self.paths['model'])
        self.dict.save(self.paths['dict'])

        utils.verbose('start vectorization for lda')
        self.ann = AnnoyIndex(self.vec_dim)
        for n, toks in enumerate(list_toks):
            if not n % 10000 and n:
                utils.verbose('vectorizing {} lines for lda'.format(n))
            vec = self.get(toks)
            self.ann.add_item(n, vec)

        utils.verbose('start building lda ann')
        self.ann.build(self.num_trees)
        self.ann.save(self.paths['ann'])
        utils.verbose('dump lda annoy into {}'.format(self.paths['ann']))