Ejemplo n.º 1
0
def subtitle_process(video_data, frame_time, subtitle):
    """
    Run align_subtitle for every key of video_data in a pool of 4 processes
    and cache the collected result as JSON at _mp.tokenize_subt. When that
    file already exists it is loaded and returned without any processing.
    :param video_data: mapping whose keys are dispatched to the workers
    :param frame_time: mapping forwarded to align_subtitle
    :param subtitle: mapping forwarded to align_subtitle
    :return res: plain dict copied from the shared result, or the cached JSON content
    """
    # Cache hit: nothing to compute.
    if exists(_mp.tokenize_subt):
        return du.json_load(_mp.tokenize_subt)

    # Inputs and the output container are shared with workers via manager proxies.
    manager = Manager()
    shared_result = manager.dict()
    shared_video = manager.dict(video_data)
    shared_time = manager.dict(frame_time)
    shared_subt = manager.dict(subtitle)

    video_keys = list(shared_video.keys())
    worker = partial(align_subtitle, shared_video, shared_time, shared_subt,
                     shared_result)

    with Pool(4) as pool, tqdm(total=len(video_keys),
                               desc="Align subtitle") as progress:
        # Results are ignored; workers write into shared_result.
        for _ in pool.imap_unordered(worker, video_keys):
            progress.update()

    res = shared_result.copy()
    du.json_dump(res, _mp.tokenize_subt)
    return res
Ejemplo n.º 2
0
def video_process(extract):
    """
    Run check_and_extract_videos over every key of get_videos_clips() with a
    pool of 16 worker processes, then dump the collected video meta data to
    _mp.video_data_file.
    :param extract: boolean, extract or not.
    :return: None
    """
    fu.make_dirs(_mp.image_dir)

    # Manager proxies let the workers read the clip table and fill the result dict.
    proxy_manager = Manager()
    clip_table = proxy_manager.dict(get_videos_clips())
    collected = proxy_manager.dict()
    clip_keys = list(clip_table.keys())

    # One task per key of the clip table.
    worker = partial(check_and_extract_videos, extract, clip_table, collected)

    with Pool(16) as pool, tqdm(total=len(clip_keys),
                                desc="Check and extract videos") as progress:
        for _ in pool.imap_unordered(worker, clip_keys):
            progress.update()

    du.json_dump(collected.copy(), _mp.video_data_file)
Ejemplo n.º 3
0
def main():
    """
    Take the first QA instance of movie 'tt0086190', copy the images listed for
    its video clips in the sample file into <benchmark_dir>/pickup (each named
    after its position and the subtitle sentence at the same position), and
    dump the QA instance to <benchmark_dir>/pickup.json.
    :return: None
    """
    index = du.json_load(_mp.sample_index_file)
    subtitle = Subtitle().include(imdb_key=['tt0086190']).get()
    sample = du.json_load(_mp.sample_frame_file)
    qa = QA().include(imdb_key=['tt0086190']).get()

    ins = qa[0]
    imdb_key = ins['imdb_key']

    # Keep the subtitle indices whose entry in the per-question spec array is 1.
    spec_path = os.path.join(_mp.encode_dir, ins['qid'] + '_spec' + '.npy')
    spec = np.load(spec_path)
    iid = [
        line_idx for pos, line_idx in enumerate(index[imdb_key])
        if spec[pos] == 1
    ]
    all_lines = subtitle[imdb_key]['lines']
    sentences = [all_lines[line_idx] for line_idx in iid]

    # Image paths, clip by clip in sorted clip-name order.
    clip_names = sorted([fu.basename_wo_ext(n) for n in ins['video_clips']])
    imgs = [
        os.path.join(_mp.image_dir, clip, '%s_%05d.jpg' % (clip, frame + 1))
        for clip in clip_names
        for frame in sample[imdb_key][clip]
    ]
    print(len(imgs))

    pickup_dir = os.path.join(_mp.benchmark_dir, 'pickup')
    for idx, img in enumerate(imgs):
        target = os.path.join(pickup_dir, '%d_%s.jpg' % (idx, sentences[idx]))
        copy(img, target)

    du.json_dump(ins, os.path.join(_mp.benchmark_dir, 'pickup.json'))
Ejemplo n.º 4
0
    def test(self):
        """
        Restore a checkpoint, run the model over the test data and dump the
        predicted answers.

        The checkpoint is the module-level ``args.checkpoint`` when it is set,
        otherwise the latest checkpoint found in ``self._checkpoint_dir``. The
        JSON written to ``self._test_file`` maps each question id to the
        predicted answer converted to int.
        :return: None
        """
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.OFF

        # The FileWriter is only entered for its side effect of recording sess.graph.
        with tf.Session(config=config) as sess, tf.summary.FileWriter(self._log_dir, sess.graph):
            if debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

            # Resolve the checkpoint once so the logged path is exactly the one
            # that gets restored (the directory may change between two lookups).
            checkpoint = args.checkpoint or tf.train.latest_checkpoint(self._checkpoint_dir)
            print('Restore from', checkpoint)
            self.saver.restore(sess, checkpoint)

            predictions = {}
            sess.run(self.test_init_op_list, feed_dict=self.test_data.feed_dict)

            for i in trange(len(self.test_data)):
                ans = sess.run(self.test_answer)
                qid = self.test_data.qa[self.test_data.index[i]]['qid']
                predictions[qid] = int(ans[:])

            du.json_dump(predictions, self._test_file)
Ejemplo n.º 5
0
    def create_tfrecord(self):
        """
        Load every target array with np.load, record its metadata in self.info
        (dumped to self.info_file), and write the tfrecord shards with a process
        pool of self.num_threads workers.

        Arrays with ndim == 3 are passed on as sequences (feature_list); arrays
        with ndim 1 or 2 are passed on as plain features.
        :raises AssertionError: if the targets do not all have the same length
        :raises ValueError: if a target has ndim 0 or ndim greater than 3
        :return: None
        """
        arrays = {name: np.load(path) for name, path in self.target.items()}

        # All targets must agree on their length; anything left in check_tail is a mismatch.
        distinct_lengths = list(set(len(arr) for arr in arrays.values()))
        self.num_example, *check_tail = distinct_lengths
        assert len(check_tail) == 0, \
            'Different length of targets. %s' % check_tail

        self.info['number_example'] = self.num_example
        self.info['num_shards'] = self.num_shards
        self.info['data'] = {}

        features = {}  # targets with ndim 1 or 2
        feature_list = {}  # targets with ndim 3 (sequences)
        for name, arr in arrays.items():
            rank = arr.ndim
            # Save tensor information
            self.info['data'][name] = {
                'dim': rank,
                'shape': arr.shape,
                'dtype': str(arr.dtype)
            }
            if rank == 3:
                feature_list[name] = arr
            elif 0 < rank < 3:
                features[name] = arr
            else:
                raise ValueError(
                    "Wrong dimension (%d) of target value. Can't be processed later."
                    % rank)

        du.json_dump(self.info, self.info_file)
        deliverer = MailPerson(self.tfrecord_pattern, self.num_shards,
                               self.num_example, features, feature_list)

        with Pool(self.num_threads) as pool, tqdm(
                total=self.num_shards, desc=self.target_dir) as progress:
            for _ in pool.imap_unordered(create_one_tfrecord, deliverer):
                progress.update()
Ejemplo n.º 6
0
 def process():
     """
     Process frame time of each movie: every '*.matidx' file under
     _mp.frame_time_dir is parsed with FrameTime.get_frame_time, and the
     result is dumped to _mp.frame_time_file.
     :return frame_time: dictionary mapping imdb key to a list of timestamp
     """
     matidx_paths = glob(join(_mp.frame_time_dir, '*.matidx'))
     # The file name without extension serves as the imdb key.
     frame_time = {
         fu.basename_wo_ext(path): FrameTime.get_frame_time(path)
         for path in tqdm(matidx_paths, desc='Process frame time')
     }
     du.json_dump(frame_time, _mp.frame_time_file, indent=0)
     return frame_time
Ejemplo n.º 7
0
    def process():
        """
        Parse every '*.sbd' file under _mp.shot_boundary_dir with
        SHOT_BOUNDARY_REGEX, dump the result to _mp.shot_boundary_file and
        return it.
        :return shot_boundary: dictionary mapping file base name to
            {'start': [int, ...], 'end': [int, ...]}
        """
        shot_boundary = {}
        for path in tqdm(glob(join(_mp.shot_boundary_dir, '*.sbd')),
                         desc='Process shot boundary'):
            entry = {'start': [], 'end': []}
            shot_boundary[fu.basename_wo_ext(path)] = entry
            with open(path, 'r') as f:
                text = f.read()
            # Group 1 is the start and group 2 the end of each match.
            for match in SHOT_BOUNDARY_REGEX.finditer(text):
                entry['start'].append(int(match.group(1)))
                entry['end'].append(int(match.group(2)))

        du.json_dump(shot_boundary, _mp.shot_boundary_file)
        return shot_boundary
Ejemplo n.º 8
0
 def inject_param(self, arg=None, val=None):
     """
     Override already-known parameters and write them to self.param_file.
     Only keys that are present in self.args_dict / self.val_dict are
     overwritten; unknown keys in arg / val are ignored.
     :param arg: optional mapping of overrides for self.args_dict
     :param val: optional mapping of overrides for self.val_dict
     :return: None
     """
     for overrides, store in ((arg, self.args_dict), (val, self.val_dict)):
         if not overrides:
             continue
         for key in overrides:
             if key in store:
                 store[key] = overrides[key]

     params = {'args': self.args_dict, 'val': self.val_dict}
     print('1*')
     pp.pprint(params)
     print('2*')
     du.json_dump(params, self.param_file)
     print('3*')
Ejemplo n.º 9
0
    def process():
        """
        Process subtitle files of movies. Every '*.srt' file under
        _mp.subtitle_dir is read as ISO-8859-1; in each entry new lines are
        replaced with a space, <...> tags and stray angle brackets are removed,
        and the text is NFKD-normalized and reduced to ASCII. An entry is then
        split into sentences that share the entry's time span equally. The
        result is dumped to _mp.subtitle_file.
        :return subtitle: dictionary mapping imdb key to
            {'lines': [...], 'start': [...], 'end': [...]}, ordered by start time
        """
        subtitle = {}
        subtitle_paths = glob(join(_mp.subtitle_dir, '*.srt'))
        for p in tqdm(subtitle_paths, desc='Process subtitle'):
            # basename -> imdb_key
            basename = fu.basename_wo_ext(p)
            lines, starts, ends = [], [], []
            with open(p, 'r', encoding='iso-8859-1') as f:
                raw_text = f.read()

            for match in SRT_REGEX.finditer(raw_text):
                # Only the timestamps and the text of an entry are used.
                _, raw_start, raw_end, _, content = match.groups()

                content = re.sub(r'\r\n|\n', ' ', content)
                content = re.sub(r'<.+?>', '', content, flags=re.DOTALL)
                content = re.sub(r'[<>]', '', content)
                content = normalize("NFKD", content)
                content = content.encode('utf-8').decode('ascii', 'ignore').strip()
                if not content:
                    continue

                sentences = [sent.strip() for sent in sent_tokenize(content) if sent.strip()]
                if not sentences:
                    # Nothing usable; also avoids dividing by zero below.
                    continue

                s = Subtitle.timestamp_to_secs(raw_start)
                e = Subtitle.timestamp_to_secs(raw_end)
                if s > e:
                    s, e = e, s
                time_span = (e - s) / len(sentences)
                for idx, sent in enumerate(sentences):
                    starts.append(s + time_span * idx)
                    ends.append(s + time_span * (idx + 1))
                    lines.append(sent)

            # A stable sort keeps sentences with identical start times in
            # their original reading order.
            order = np.argsort(np.array(starts), kind='stable')
            subtitle[basename] = {
                'lines': [lines[idx] for idx in order],
                'start': [starts[idx] for idx in order],
                'end': [ends[idx] for idx in order],
            }

        du.json_dump(subtitle, _mp.subtitle_file, indent=0)
        return subtitle
Ejemplo n.º 10
0
def create_vocab(qa, subtitle, video_data, gram_vocab, gram_embed):
    """
    Tokenize subtitles and QA in place, build the vocabulary, and compose a
    word embedding for each word by summing the embeddings of its character
    1-grams, 3-grams and 6-grams found in gram_vocab. Words with no known
    gram are left out of the vocabulary. All results are saved to disk; when
    _mp.vocab_file already exists they are loaded from disk instead.
    :param qa: list of QA instances with 'question' and 'answers'
    :param subtitle: mapping from key to a dict holding a 'lines' list
    :param video_data: iterable of keys selecting which subtitles are tokenized
    :param gram_vocab: mapping from gram to its row index in gram_embed
    :param gram_embed: numpy array of gram embeddings
    :return filter_vocab, subtitle, vocab_embed, frequency, qa:
    """
    if os.path.exists(_mp.vocab_file):
        # Produced by an earlier run: reload everything.
        filter_vocab = du.json_load(_mp.vocab_file)
        subtitle = du.json_load(_mp.temp_subtitle_file)
        vocab_embed = np.load(_mp.embedding_file)
        frequency = du.json_load(_mp.freq_file)
        qa = du.json_load(_mp.tokenize_qa)
        return filter_vocab, subtitle, vocab_embed, frequency, qa

    counts = Counter()

    for key in tqdm(video_data, desc='Tokenize Subtitle'):
        lines = subtitle[key]['lines']
        for pos, raw_line in enumerate(lines):
            tokens = wordpunct_tokenize(raw_line.strip().lower())
            counts.update(tokens)
            lines[pos] = tokens

    for ins in tqdm(qa, desc='Tokenize QA'):
        question_tokens = wordpunct_tokenize(ins['question'].strip().lower())
        ins['question'] = question_tokens
        counts.update(question_tokens)
        # An empty answer is represented by a single '.' token.
        ins['answers'] = [
            wordpunct_tokenize(sent.strip().lower()) if sent else ['.']
            for sent in ins['answers']
        ]
        for answer_tokens in ins['answers']:
            counts.update(answer_tokens)

    filter_vocab = {}
    frequency = {}
    next_id = 1  # row 0 is never assigned and stays all-zero
    embed_dim = gram_embed.shape[1]
    vocab_embed = np.zeros((len(counts) + 1, embed_dim), dtype=np.float32)
    for word in tqdm(counts, desc='Create Embedding'):
        wrapped = '<' + word + '>'
        grams = list(word)
        grams += [wrapped[i:i + 3] for i in range(len(wrapped) - 2)]
        grams += [wrapped[i:i + 6] for i in range(len(wrapped) - 5)]
        codes = [gram_vocab[gram] for gram in grams if gram in gram_vocab]
        if not codes:
            continue
        frequency[word] = counts[word]
        filter_vocab[word] = next_id
        vocab_embed[next_id] = np.sum(gram_embed[codes], axis=0)
        next_id += 1

    # Drop the rows reserved for words that were filtered out.
    vocab_embed = vocab_embed[:next_id]
    du.json_dump(frequency, _mp.freq_file)
    du.json_dump(filter_vocab, _mp.vocab_file)
    np.save(_mp.embedding_file, vocab_embed)
    print(len(vocab_embed))
    du.json_dump(subtitle, _mp.temp_subtitle_file)
    du.json_dump(qa, _mp.tokenize_qa)

    return filter_vocab, subtitle, vocab_embed, frequency, qa
Ejemplo n.º 11
0
def load_embedding_vec(target):
    """
    Load word embedding of different method. You can refer to EmbeddingPath
    to prepare data. The parsed keys and vectors are cached on disk; when both
    cache files exist they are loaded instead of parsing the raw file again.
    :param target: string, target word embedding: 'glove', 'w2v' or 'fasttext'
    :return embedding_keys, embedding_vecs: list of words, and numpy array of embedding vector
    :raises ValueError: if target is not one of the supported embeddings
    """
    start_time = time.time()
    # File and function setup for embedding
    if target == 'glove':
        key_file = cp.glove_embedding_key_file
        vec_file = cp.glove_embedding_vec_file
        raw_file = cp.glove_file
        load_fn = load_glove
    elif target == 'w2v':
        key_file = cp.w2v_embedding_key_file
        vec_file = cp.w2v_embedding_vec_file
        raw_file = cp.word2vec_file
        load_fn = load_w2v
    elif target == 'fasttext':
        key_file = cp.ft_embedding_key_file
        vec_file = cp.ft_embedding_vec_file
        raw_file = cp.fasttext_file
        # fasttext is read with the same loader as glove.
        load_fn = load_glove
    else:
        # Fail early with a clear message instead of passing None paths on.
        raise ValueError(
            "Unknown embedding target %r; expected 'glove', 'w2v' or 'fasttext'."
            % (target,))

    # Check if there already exists pre-loaded file
    if exists(key_file) and exists(vec_file):
        embedding_keys = du.json_load(key_file)
        embedding_vecs = np.load(vec_file)
    else:
        # If not, load it from text file
        embedding_keys, embedding_vecs = load_fn(raw_file)
        du.json_dump(embedding_keys, key_file)
        np.save(vec_file, embedding_vecs)

    print('Loading embedding done. %.3f s' % (time.time() - start_time))
    return embedding_keys, embedding_vecs
Ejemplo n.º 12
0
def tokenize_question_answer(qa):
    """
    Tokenize the question and the answers of every QA instance (in place) with
    word_tokenize after stripping and lower-casing, and cache the result as
    JSON at _mp.tokenize_qa. When that file already exists it is loaded and
    returned instead.
    :param qa: list of QA instances with 'question' and 'answers'
    :return tokenize_qa: list of tokenized QA instances
    """
    if exists(_mp.tokenize_qa):
        return du.json_load(_mp.tokenize_qa)

    tokenize_qa = []
    for ins in tqdm(qa, desc='Tokenize qa'):
        ins['question'] = word_tokenize(ins['question'].strip().lower())

        tokenized_answers = []
        for sent in ins['answers']:
            if sent:
                tokenized_answers.append(word_tokenize(sent.strip().lower()))
            else:
                # An empty answer is represented by a single '.' token.
                tokenized_answers.append(['.'])
        ins['answers'] = tokenized_answers

        tokenize_qa.append(ins)

    du.json_dump(tokenize_qa, _mp.tokenize_qa)
    return tokenize_qa
Ejemplo n.º 13
0
def subtitle_process(video_data, frame_time, subtitle):
    """
    Run the frame sampler for every key of video_data in a pool of 4 processes
    and dump the results to _mp.sample_frame_file and _mp.sample_index_file.
    The module-level flag args.one selects sample_frame_v2 over sample_frame.
    :param video_data: mapping whose keys are dispatched to the workers
    :param frame_time: mapping forwarded to the sampler
    :param subtitle: mapping forwarded to the sampler
    :return: plain dict copy of the shared sample result
    """
    # Inputs and both output containers are shared with workers via manager proxies.
    manager = Manager()
    sample = manager.dict()
    index = manager.dict()
    shared_video = manager.dict(video_data)
    shared_time = manager.dict(frame_time)
    shared_subt = manager.dict(subtitle)

    video_keys = list(shared_video.keys())
    sampler = sample_frame_v2 if args.one else sample_frame
    align_func = partial(sampler, shared_video, shared_time, shared_subt,
                         sample, index)

    with Pool(4) as pool, tqdm(total=len(video_keys),
                               desc="Align subtitle") as progress:
        for _ in pool.imap_unordered(align_func, video_keys):
            progress.update()

    du.json_dump(sample.copy(), _mp.sample_frame_file)
    du.json_dump(index.copy(), _mp.sample_index_file)
    return sample.copy()
Ejemplo n.º 14
0
def create_vocab_embedding(tokenize_subt, tokenize_qa):
    """
    Build the vocabulary and its L2-normalized embedding matrix. Each word's
    vector is the sum of the embeddings of its character 1-grams, 3-grams and
    6-grams (taken from '<word>') that are present in the gram vocabulary;
    words without any known gram are left out. The vocabulary is saved to
    _mp.vocab_file and the matrix to _mp.embedding_file. When both files
    already exist, only the vocabulary is loaded and returned.
    :param tokenize_subt: tokenized subtitles, forwarded to create_vocab
    :param tokenize_qa: tokenized QA, forwarded to create_vocab
    :return filter_vocab: dictionary mapping word to its row index in the embedding matrix
    """
    if exists(_mp.embedding_file) and exists(_mp.vocab_file):
        return du.json_load(_mp.vocab_file)

    vocab = create_vocab(tokenize_subt, tokenize_qa)
    filter_vocab, idx_vocab = {}, 1
    gram_vocab = {
        k: i
        for i, k in enumerate(du.json_load(_ep.gram_vocab_file))
    }
    gram_embed = np.load(_ep.gram_embedding_vec_file)
    # Row 0 is never assigned and stays all-zero.
    vocab_embed = np.zeros((len(vocab) + 1, gram_embed.shape[1]),
                           dtype=np.float32)

    for v in tqdm(vocab, desc='Create embedding of vocabulary'):
        v_ = '<' + v + '>'
        v_gram = [c for c in v_] + [v_[i:i + 3] for i in range(len(v_) - 2)] + \
                 [v_[i:i + 6] for i in range(len(v_) - 5)]
        v_gram_code = [
            gram_vocab[gram] for gram in v_gram if gram in gram_vocab
        ]
        if v_gram_code:
            filter_vocab[v] = idx_vocab
            # Store the vector at the id just assigned to the word. Using the
            # position in the unfiltered vocabulary would shift every row
            # after the first skipped word away from its id.
            vocab_embed[idx_vocab] = np.sum(gram_embed[v_gram_code], axis=0)
            idx_vocab += 1

    # Keep only row 0 plus one row per retained word.
    vocab_embed = vocab_embed[:idx_vocab]

    norm = np.linalg.norm(vocab_embed, axis=1, keepdims=True)
    # Avoid dividing all-zero rows by zero.
    norm = np.select([norm > 0], [norm], default=1.)
    print(norm.shape)
    norm_vocab_embed = vocab_embed / norm
    print(norm_vocab_embed.shape)

    du.json_dump(filter_vocab, _mp.vocab_file)
    np.save(_mp.embedding_file, norm_vocab_embed)
    return filter_vocab
Ejemplo n.º 15
0
def encode_sentence(tokenize_subt, tokenize_qa, vocab):
    """
    Replace every token of the subtitles and QA with its vocabulary id.
    Tokens missing from vocab are dropped; a sentence left without any token
    is encoded as [vocab['.']]. QA instances are encoded in place. Results are
    cached at _mp.encode_subtitle_file and _mp.encode_qa_file and loaded from
    there when both files exist.
    :param tokenize_subt: nested mapping, first key -> second key -> list of token lists
    :param tokenize_qa: list of QA instances with tokenized 'question' and 'answers'
    :param vocab: dictionary mapping token to id
    :return encode_subt, encode_qa: encoded subtitles and encoded QA
    """
    if exists(_mp.encode_subtitle_file) and exists(_mp.encode_qa_file):
        encode_subt = du.json_load(_mp.encode_subtitle_file)
        encode_qa = du.json_load(_mp.encode_qa_file)
        return encode_subt, encode_qa

    def to_ids(tokens):
        # Out-of-vocabulary tokens are dropped; an empty result falls back to '.'.
        ids = [vocab[w] for w in tokens if w in vocab]
        return ids or [vocab['.']]

    encode_subt = {}
    for imdb in tqdm(tokenize_subt, desc='Encode subtitle'):
        encode_subt[imdb] = {
            v: [to_ids(sent) for sent in tokenize_subt[imdb][v]]
            for v in tokenize_subt[imdb]
        }

    # The QA list is encoded in place, so encode_qa aliases tokenize_qa.
    encode_qa = tokenize_qa
    for ins in tqdm(encode_qa, desc='Encode question answer'):
        ins['question'] = to_ids(ins['question'])
        for idx, answer in enumerate(ins['answers']):
            ins['answers'][idx] = to_ids(answer)

    du.json_dump(encode_subt, _mp.encode_subtitle_file)
    du.json_dump(encode_qa, _mp.encode_qa_file)
    return encode_subt, encode_qa
Ejemplo n.º 16
0
 def _update_exp(self, item):
     """
     Merge item into self.exp and write the updated self.exp to
     config.exp_file with du.json_dump.
     :param item: whatever self.exp.update accepts
     :return: None
     """
     self.exp.update(item)
     du.json_dump(self.exp, config.exp_file)
Ejemplo n.º 17
0
def main():
    """
    Split the QA data, tokenize it, build the vocabulary (optionally backed by
    a pre-trained embedding chosen with args.embedding) and encode QA and
    subtitles. The split QA, tokenized QA, encoded QA, vocabulary and encoded
    subtitles are written as JSON files; when a pre-trained embedding is used
    and no cached version exists, the filtered embedding is cached as well.
    :return: None
    """
    start_time = time.time()
    video_data = du.json_load(config.video_data_file)
    video_subtitle = du.json_load(config.subtitle_file)
    shot_boundary = du.json_load(config.shot_boundary_file)
    subtitle_shot = du.json_load(config.subtitle_shot_file)

    # Close the QA file deterministically instead of leaking the handle.
    with open(config.qa_file, 'r') as qa_fp:
        qa = json.load(qa_fp)

    embed_file = None
    avail_embed_file = None
    avail_embed_npy_file = None
    if args.embedding == 'word2vec':
        embed_file = config.word2vec_file
        avail_embed_file = config.w2v_embedding_file
        avail_embed_npy_file = config.w2v_embedding_npy_file
    elif args.embedding == 'fasttext':
        embed_file = config.fasttext_file
        avail_embed_file = config.ft_embedding_file
        avail_embed_npy_file = config.ft_embedding_npy_file
    elif args.embedding == 'glove':
        embed_file = config.glove_file
        avail_embed_file = config.glove_embedding_file
        avail_embed_npy_file = config.glove_embedding_npy_file

    embedding = None
    # Without a selected embedding the cache paths stay None and must not be
    # passed to exists(); in that case there is simply no cached embedding.
    embed_exist = (avail_embed_file is not None
                   and avail_embed_npy_file is not None
                   and exists(avail_embed_file)
                   and exists(avail_embed_npy_file))
    if embed_file and not embed_exist:
        embedding = du.load_w2v(embed_file)

    print('Loading json file done!! Take %.4f sec.' %
          (time.time() - start_time))

    total_qa = get_split(qa, video_data)

    def count_flag(split, flag):
        # Number of QA instances of a split whose flag is truthy.
        return sum(1 for qa_ in total_qa[split] if qa_[flag])

    print('Available qa # : train | tests | val ')
    print('                 %5d   %4d   %3d' %
          (count_flag('train', 'avail'), count_flag('tests', 'avail'),
           count_flag('val', 'avail')))
    print('Mv+Sub qa # :    train | tests | val ')
    print('                 %5d   %4d   %3d' %
          (count_flag('train', 'mv+sub'), count_flag('tests', 'mv+sub'),
           count_flag('val', 'mv+sub')))
    print('Total qa # :     train | tests | val ')
    print(
        '                 %5d   %4d   %3d' %
        (len(total_qa['train']), len(total_qa['tests']), len(total_qa['val'])))

    tokenize_qa_train, vocab_counter = \
        tokenize_sentences(total_qa['train'],
                           video_subtitle,
                           is_train=not embed_exist)
    # Build vocab
    if embed_file:
        if not embed_exist:
            qa_embedding, vocab, inverse_vocab = build_vocab(
                vocab_counter, embedding)
            # Cache the filtered embedding for later runs.
            fu.safe_remove(avail_embed_file)
            du.json_dump(inverse_vocab, avail_embed_file)
            fu.safe_remove(avail_embed_npy_file)
            np.save(
                avail_embed_npy_file,
                np.array(list(qa_embedding.values()), dtype=np.float32))
        else:
            inverse_vocab = du.json_load(avail_embed_file)
            vocab = {k: i for i, k in enumerate(inverse_vocab)}
    else:
        _, vocab, inverse_vocab = build_vocab(vocab_counter)

    # encode sentences
    tokenize_qa_test, _ = tokenize_sentences(total_qa['tests'], video_subtitle)
    tokenize_qa_val, _ = tokenize_sentences(total_qa['val'], video_subtitle)
    encode_sub = encode_subtitles(video_subtitle, vocab, shot_boundary,
                                  subtitle_shot)
    encode_qa_train = encode_sentences(tokenize_qa_train, vocab)
    encode_qa_test = encode_sentences(tokenize_qa_test, vocab)
    encode_qa_val = encode_sentences(tokenize_qa_val, vocab)

    tokenize_qa = {
        'tokenize_qa_train': tokenize_qa_train,
        'tokenize_qa_test': tokenize_qa_test,
        'tokenize_qa_val': tokenize_qa_val,
    }

    encode_qa = {
        'encode_qa_train': encode_qa_train,
        'encode_qa_test': encode_qa_test,
        'encode_qa_val': encode_qa_val,
    }
    vocab_all = {
        'vocab': vocab,
        'inverse_vocab': inverse_vocab,
    }

    # Remove stale outputs before writing the new ones.
    fu.safe_remove(total_qa_file_name)
    fu.safe_remove(tokenize_file_name)
    fu.safe_remove(encode_file_name)
    fu.safe_remove(all_vocab_file_name)
    fu.safe_remove(config.encode_subtitle_file)

    du.json_dump(total_qa, total_qa_file_name)
    du.json_dump(tokenize_qa, tokenize_file_name)
    du.json_dump(encode_qa, encode_file_name)
    du.json_dump(vocab_all, all_vocab_file_name)
    du.json_dump(encode_sub, config.encode_subtitle_file)
Ejemplo n.º 18
0
 def save(self):
     """
     Dump self.args to self.param_file with du.json_dump.
     :return: None
     """
     du.json_dump(self.args, self.param_file)
Ejemplo n.º 19
0
def process(args):
    """
    Process word embedding. We split each words into n-gram character tokens
    (single characters of the word, plus 3-grams and 6-grams of '<word>'),
    print some statistics, and, unless args.debug is set, save the gram
    counters, the gram vocabulary and the encoded keys and vectors for later
    training.
    :param args: named tuple, arguments
    :return: None
    """
    embedding_keys, embedding_vector = load_embedding_vec(cp.target)

    fu.block_print([
        "%s's # of embedding: %d" % (cp.target, len(embedding_keys)),
        "%s's shape of embedding vec: %s" % (cp.target, str(embedding_vector.shape)),
    ])

    print('Length set of words:\n', set(len(k) for k in embedding_keys))

    embedding_keys, embedding_vector = filter_stat(embedding_keys, embedding_vector, cp.max_length)

    counter_1gram, counter_3gram, counter_6gram = Counter(), Counter(), Counter()
    size_set = set()
    gram_embedding_keys = []

    # Update counter and divide each word to n-gram.
    for word in tqdm(embedding_keys, desc='Counting'):
        wrapped = '<' + word + '>'
        three_gram = [wrapped[i:i + 3] for i in range(len(wrapped) - 2)]
        six_gram = [wrapped[i:i + 6] for i in range(len(wrapped) - 5)]

        counter_1gram.update(word)
        counter_3gram.update(three_gram)
        counter_6gram.update(six_gram)

        grams = list(word) + three_gram + six_gram
        gram_embedding_keys.append(grams)
        size_set.add(len(grams))

    max_size = max(size_set, default=0)

    print('Max size of tokens:', max_size)
    print('Size set of tokens:', size_set)
    print('Number of grams:\n',
          '1-gram:', len(counter_1gram), '3-gram:', len(counter_3gram), '6-gram:', len(counter_6gram))

    if args.debug:
        # Debug runs only print the statistics above.
        return

    du.json_dump({'1': counter_1gram, '3': counter_3gram, '6': counter_6gram}, cp.gram_counter_file)
    vocab = list(counter_1gram) + list(counter_3gram) + list(counter_6gram) + [UNK]
    du.json_dump(vocab, cp.gram_vocab_file)
    gtoi = {gram: idx for idx, gram in enumerate(vocab)}

    # Rows shorter than max_size keep the fill value len(gtoi) - 1 in their tail.
    encoded_embedding_keys = np.full((len(embedding_keys), max_size), len(gtoi) - 1, dtype=np.int64)
    for idx, grams in enumerate(tqdm(gram_embedding_keys, desc='Encoding')):
        encoded_embedding_keys[idx, :len(grams)] = [gtoi[gram] for gram in grams]
    print(encoded_embedding_keys[:5])
    assert len(encoded_embedding_keys) == len(embedding_vector), \
        'First dimensions of encoded keys and vectors are not matched.'

    start_time = time.time()
    fu.safe_remove(cp.encode_embedding_key_file)
    fu.safe_remove(cp.encode_embedding_vec_file)
    np.save(cp.encode_embedding_key_file, encoded_embedding_keys)
    np.save(cp.encode_embedding_vec_file, embedding_vector)
    print('Saveing processed data with %.3f s' % (time.time() - start_time))