def subtitle_process(video_data, frame_time, subtitle):
    """
    Align subtitles for every key of ``video_data``, caching the result on disk.

    When the file ``_mp.tokenize_subt`` already exists it is loaded and returned directly.
    Otherwise ``align_subtitle`` is applied to every key in a pool of 4 worker processes,
    the collected result is dumped to ``_mp.tokenize_subt`` and returned.

    :param video_data: mapping whose keys are the work items handed to the workers
    :param frame_time: mapping forwarded to ``align_subtitle``
    :param subtitle: mapping forwarded to ``align_subtitle``
    :return: a copy of the shared result dict, or the content of the cached file
    """
    if exists(_mp.tokenize_subt):
        return du.json_load(_mp.tokenize_subt)

    # Manager-backed dicts let the pool workers read the inputs and fill the output.
    manager = Manager()
    shared_result = manager.dict()
    shared_video = manager.dict(video_data)
    shared_time = manager.dict(frame_time)
    shared_subt = manager.dict(subtitle)
    work_keys = list(shared_video.keys())
    worker = partial(align_subtitle, shared_video, shared_time, shared_subt, shared_result)

    with Pool(4) as pool, tqdm(total=len(work_keys), desc="Align subtitle") as progress:
        # Results are only used to advance the progress bar; workers write into shared_result.
        for _ in pool.imap_unordered(worker, work_keys):
            progress.update()

    aligned = shared_result.copy()
    du.json_dump(aligned, _mp.tokenize_subt)
    return aligned
def video_process(extract):
    """
    Check (and optionally extract) all video clips with a pool of worker processes.

    The video meta data collected by the workers is saved to ``_mp.video_data_file``.

    :param extract: boolean, extract or not.
    :return: None
    """
    fu.make_dirs(_mp.image_dir)

    # Manager-backed dicts are shared between this process and the pool workers.
    manager = Manager()
    shared_clips = manager.dict(get_videos_clips())
    shared_data = manager.dict()
    clip_keys = list(shared_clips.keys())
    worker = partial(check_and_extract_videos, extract, shared_clips, shared_data)

    with Pool(16) as pool, tqdm(total=len(clip_keys),
                                desc="Check and extract videos") as progress:
        # One task per key of the clip mapping; results arrive in completion order
        # and are only used to advance the progress bar.
        for _ in pool.imap_unordered(worker, clip_keys):
            progress.update()

    du.json_dump(shared_data.copy(), _mp.video_data_file)
def main():
    """
    Export the sampled frames of the first QA instance of movie ``tt0086190``.

    The frames listed in the sample file for the instance's video clips are copied into
    ``<_mp.benchmark_dir>/pickup``; each copy is named after its position and the subtitle
    line selected (via the ``*_spec.npy`` mask) at the same position. The QA instance itself
    is dumped to ``<_mp.benchmark_dir>/pickup.json``.

    :return: None
    """
    index = du.json_load(_mp.sample_index_file)
    subtitle = Subtitle().include(imdb_key=['tt0086190']).get()
    sample = du.json_load(_mp.sample_frame_file)
    qa = QA().include(imdb_key=['tt0086190']).get()

    ins = qa[0]
    imdb_key = ins['imdb_key']

    # Keep only the subtitle lines whose position is flagged with 1 in the spec mask.
    spec_path = os.path.join(_mp.encode_dir, ins['qid'] + '_spec' + '.npy')
    spec = np.load(spec_path)
    movie_lines = subtitle[imdb_key]['lines']
    sentences = [
        movie_lines[line_idx]
        for pos, line_idx in enumerate(index[imdb_key])
        if spec[pos] == 1
    ]

    # Frame numbers in the sample file are zero-based, image file names are one-based.
    imgs = []
    clip_names = sorted(fu.basename_wo_ext(name) for name in ins['video_clips'])
    for clip in clip_names:
        for frame in sample[imdb_key][clip]:
            imgs.append(
                os.path.join(_mp.image_dir, clip, '%s_%05d.jpg' % (clip, frame + 1)))
    print(len(imgs))

    for pos, img in enumerate(imgs):
        target = os.path.join(_mp.benchmark_dir, 'pickup',
                              '%d_%s.jpg' % (pos, sentences[pos]))
        copy(img, target)

    du.json_dump(ins, os.path.join(_mp.benchmark_dir, 'pickup.json'))
def test(self):
    """
    Run the model over the test data and dump the answers to ``self._test_file``.

    The weights are restored from ``args.checkpoint`` when it is set, otherwise from the
    latest checkpoint found in ``self._checkpoint_dir``. ``self.test_answer`` is evaluated
    once per test example and stored, converted to ``int``, under the example's ``qid``.

    :return: None
    """
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.OFF

    predictions = {}
    with tf.Session(config=config) as sess, \
            tf.summary.FileWriter(self._log_dir, sess.graph):
        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

        # Resolve the checkpoint once so the logged path is exactly the one restored.
        checkpoint = args.checkpoint or tf.train.latest_checkpoint(self._checkpoint_dir)
        print('Restore from', checkpoint)
        self.saver.restore(sess, checkpoint)

        sess.run(self.test_init_op_list, feed_dict=self.test_data.feed_dict)
        for i in trange(len(self.test_data)):
            ans = sess.run(self.test_answer)
            # test_data.index maps the running position to the entry in test_data.qa.
            qid = self.test_data.qa[self.test_data.index[i]]['qid']
            predictions[qid] = int(ans[:])

    du.json_dump(predictions, self._test_file)
def create_tfrecord(self):
    """
    Load all target arrays, record their metadata and write them out shard by shard.

    Every path in ``self.target`` is loaded with ``np.load``. All arrays must have the same
    length, which becomes ``self.num_example``. Rank-3 arrays are collected as sequence
    features (``feature_list``), rank-1 and rank-2 arrays as plain ``features``. The tensor
    information is dumped to ``self.info_file`` and the shards are produced by mapping
    ``create_one_tfrecord`` over a ``MailPerson`` in a pool of ``self.num_threads`` workers.

    :raises AssertionError: if the targets do not all have the same length
    :raises ValueError: if a target array has a rank other than 1, 2 or 3
    :return: None
    """
    arrays = {name: np.load(path) for name, path in self.target.items()}

    # A single distinct length means all targets agree; anything left over is a mismatch.
    self.num_example, *check_tail = list({len(arr) for arr in arrays.values()})
    assert not check_tail, 'Different length of targets. %s' % check_tail

    self.info['number_example'] = self.num_example
    self.info['num_shards'] = self.num_shards
    self.info['data'] = {}

    features = {}      # rank-1 / rank-2 targets
    feature_list = {}  # rank-3 targets, treated as sequences
    for name, arr in arrays.items():
        rank = arr.ndim
        # Save tensor information
        self.info['data'][name] = {
            'dim': rank,
            'shape': arr.shape,
            'dtype': str(arr.dtype),
        }
        if rank == 3:
            feature_list[name] = arr
        elif rank in (1, 2):
            features[name] = arr
        else:
            raise ValueError(
                "Wrong dimension (%d) of target value. Can't be processed later." % rank)

    du.json_dump(self.info, self.info_file)

    deliverer = MailPerson(self.tfrecord_pattern, self.num_shards, self.num_example,
                           features, feature_list)
    with Pool(self.num_threads) as pool, \
            tqdm(total=self.num_shards, desc=self.target_dir) as pbar:
        for _ in pool.imap_unordered(create_one_tfrecord, deliverer):
            pbar.update()
def process():
    """
    Process frame time of each movie and save it to ``_mp.frame_time_file``.

    Every ``*.matidx`` file in ``_mp.frame_time_dir`` is parsed with
    ``FrameTime.get_frame_time``; the file name without extension is used as imdb key.

    :return frame_time: dictionary mapping imdb key to a list of timestamp
    """
    pattern = join(_mp.frame_time_dir, '*.matidx')
    frame_time = {
        fu.basename_wo_ext(path): FrameTime.get_frame_time(path)
        for path in tqdm(glob(pattern), desc='Process frame time')
    }
    du.json_dump(frame_time, _mp.frame_time_file, indent=0)
    return frame_time
def process():
    """
    Parse the shot boundary files and save the result to ``_mp.shot_boundary_file``.

    For every ``*.sbd`` file in ``_mp.shot_boundary_dir`` all matches of
    ``SHOT_BOUNDARY_REGEX`` are collected; group 1 is stored as start and group 2 as end,
    both converted to ``int``.

    :return shot_boundary: dictionary mapping the file's base name to
        ``{'start': [...], 'end': [...]}``
    """
    shot_boundary = {}
    pattern = join(_mp.shot_boundary_dir, '*.sbd')
    for path in tqdm(glob(pattern), desc='Process shot boundary'):
        with open(path, 'r') as sbd_file:
            text = sbd_file.read()

        starts, ends = [], []
        for found in SHOT_BOUNDARY_REGEX.finditer(text):
            starts.append(int(found.group(1)))
            ends.append(int(found.group(2)))
        shot_boundary[fu.basename_wo_ext(path)] = {'start': starts, 'end': ends}

    du.json_dump(shot_boundary, _mp.shot_boundary_file)
    return shot_boundary
def inject_param(self, arg=None, val=None):
    """
    Override known parameters and persist the resulting parameter set.

    Only keys that already exist in ``self.args_dict`` / ``self.val_dict`` are overwritten;
    any other key of ``arg`` / ``val`` is ignored. The combined dictionaries are then
    pretty-printed and dumped to ``self.param_file``.

    :param arg: optional mapping of new values for ``self.args_dict``
    :param val: optional mapping of new values for ``self.val_dict``
    :return: None
    """
    for key in (arg or ()):
        if key in self.args_dict:
            self.args_dict[key] = arg[key]

    for key in (val or ()):
        if key in self.val_dict:
            self.val_dict[key] = val[key]

    merged = {'args': self.args_dict, 'val': self.val_dict}
    # The numbered markers bracket the pretty-print and the dump on stdout.
    print('1*')
    pp.pprint(merged)
    print('2*')
    du.json_dump(merged, self.param_file)
    print('3*')
def process():
    """
    Process subtitle files of movies and save the result to ``_mp.subtitle_file``.

    Every ``*.srt`` file in ``_mp.subtitle_dir`` is read as ISO-8859-1. For each entry
    matched by ``SRT_REGEX`` the line breaks are replaced by a space, ``<...>`` tags and
    stray angle brackets are removed, and the text is NFKD-normalized and reduced to ASCII.
    The text is then split into sentences and the entry's time span is divided evenly among
    them. Finally the sentences of each movie are ordered by start time.

    :return subtitle: dictionary mapping imdb key to
        ``{'lines': [...], 'start': [...], 'end': [...]}``
    """
    subtitle = {}
    subtitle_paths = glob(join(_mp.subtitle_dir, '*.srt'))
    for p in tqdm(subtitle_paths, desc='Process subtitle'):
        # The file name without extension is the imdb key.
        basename = fu.basename_wo_ext(p)
        lines, starts, ends = [], [], []

        with open(p, 'r', encoding='iso-8859-1') as f:
            raw_text = f.read()

        for match in SRT_REGEX.finditer(raw_text):
            _raw_index, raw_start, raw_end, _proprietary, content = match.groups()
            content = re.sub(r'\r\n|\n', ' ', content)
            content = re.sub(r'<.+?>', '', content, flags=re.DOTALL)
            content = re.sub(r'[<>]', '', content)
            content = normalize("NFKD", content)
            # Drop everything that has no ASCII representation after normalization.
            content = content.encode('utf-8').decode('ascii', 'ignore').strip()
            if not content:
                continue

            sentences = [sent.strip() for sent in sent_tokenize(content) if sent.strip()]
            if not sentences:
                # Nothing usable left; skipping also avoids a division by zero below.
                continue

            s = Subtitle.timestamp_to_secs(raw_start)
            e = Subtitle.timestamp_to_secs(raw_end)
            if s > e:
                s, e = e, s
            # Each sentence gets an equal share of the entry's duration.
            time_span = (e - s) / len(sentences)
            for idx, sent in enumerate(sentences):
                starts.append(s + time_span * idx)
                ends.append(s + time_span * (idx + 1))
                lines.append(sent)

        # A stable sort keeps sentences with equal start time (e.g. zero-length entries)
        # in their original order; the default argsort kind gives no such guarantee.
        order = np.argsort(np.array(starts), kind='mergesort')
        subtitle[basename] = {
            'lines': [lines[idx] for idx in order],
            'start': [starts[idx] for idx in order],
            'end': [ends[idx] for idx in order],
        }

    du.json_dump(subtitle, _mp.subtitle_file, indent=0)
    return subtitle
def create_vocab(qa, subtitle, video_data, gram_vocab, gram_embed):
    """
    Tokenize subtitles and QA, and build the vocabulary with its n-gram based embedding.

    If ``_mp.vocab_file`` exists, all results are loaded from their files instead. Otherwise
    the subtitle lines of every key in ``video_data`` and all questions / answers are
    lower-cased and tokenized in place, and every token that has at least one character,
    3-gram or 6-gram in ``gram_vocab`` receives an index (starting at 1) and an embedding
    equal to the sum of the matching rows of ``gram_embed``. Row 0 of the embedding stays
    zero. All results are written to their files.

    :param qa: list of QA instances with 'question' and 'answers'; modified in place
    :param subtitle: mapping key -> {'lines': [...]}; modified in place
    :param video_data: iterable of keys selecting the subtitles to tokenize
    :param gram_vocab: mapping from gram to row index of ``gram_embed``
    :param gram_embed: 2-d array of gram embeddings
    :return: (filter_vocab, subtitle, vocab_embed, frequency, qa)
    """
    if os.path.exists(_mp.vocab_file):
        filter_vocab = du.json_load(_mp.vocab_file)
        subtitle = du.json_load(_mp.temp_subtitle_file)
        vocab_embed = np.load(_mp.embedding_file)
        frequency = du.json_load(_mp.freq_file)
        qa = du.json_load(_mp.tokenize_qa)
        return filter_vocab, subtitle, vocab_embed, frequency, qa

    def _tokens(text):
        return wordpunct_tokenize(text.strip().lower())

    counts = Counter()
    for key in tqdm(video_data, desc='Tokenize Subtitle'):
        lines = subtitle[key]['lines']
        for pos, raw_line in enumerate(lines):
            tokens = _tokens(raw_line)
            counts.update(tokens)
            lines[pos] = tokens

    for ins in tqdm(qa, desc='Tokenize QA'):
        ins['question'] = _tokens(ins['question'])
        counts.update(ins['question'])
        # An empty answer is represented by a single '.' token.
        ins['answers'] = [_tokens(ans) if ans else ['.'] for ans in ins['answers']]
        for ans_tokens in ins['answers']:
            counts.update(ans_tokens)

    filter_vocab, frequency = {}, {}
    next_index = 1  # index 0 is never assigned to a word
    vocab_embed = np.zeros((len(counts) + 1, gram_embed.shape[1]), dtype=np.float32)
    for word in tqdm(counts, desc='Create Embedding'):
        wrapped = '<' + word + '>'
        grams = list(word)
        grams += [wrapped[i:i + 3] for i in range(len(wrapped) - 2)]
        grams += [wrapped[i:i + 6] for i in range(len(wrapped) - 5)]
        codes = [gram_vocab[gram] for gram in grams if gram in gram_vocab]
        if not codes:
            # Words without any known gram are left out of the vocabulary.
            continue
        frequency[word] = counts[word]
        filter_vocab[word] = next_index
        vocab_embed[next_index] = np.sum(gram_embed[codes], axis=0)
        next_index += 1

    # Cut off the rows reserved for words that were left out.
    vocab_embed = vocab_embed[:next_index]

    du.json_dump(frequency, _mp.freq_file)
    du.json_dump(filter_vocab, _mp.vocab_file)
    np.save(_mp.embedding_file, vocab_embed)
    print(len(vocab_embed))
    du.json_dump(subtitle, _mp.temp_subtitle_file)
    du.json_dump(qa, _mp.tokenize_qa)
    return filter_vocab, subtitle, vocab_embed, frequency, qa
def load_embedding_vec(target):
    """
    Load word embedding of different method. You can refer to EmbeddingPath to prepare data.

    The keys and vectors are read from the pre-loaded key / vector files when both exist;
    otherwise they are parsed from the raw embedding file and those two files are written.

    :param target: string, target word embedding; one of 'glove', 'w2v' or 'fasttext'
    :return embedding_keys, embedding_vecs: list of words, and numpy array of embedding vector
    :raises ValueError: if ``target`` is not one of the supported names
    """
    # Reject unknown targets up front; otherwise every path below would be None and the
    # function would fail later with an unrelated error.
    if target not in ('glove', 'w2v', 'fasttext'):
        raise ValueError(
            "Unknown embedding target %r; expected 'glove', 'w2v' or 'fasttext'." % (target,))

    start_time = time.time()
    # File and function setup for embedding
    if target == 'glove':
        key_file = cp.glove_embedding_key_file
        vec_file = cp.glove_embedding_vec_file
        raw_file = cp.glove_file
        load_fn = load_glove
    elif target == 'w2v':
        key_file = cp.w2v_embedding_key_file
        vec_file = cp.w2v_embedding_vec_file
        raw_file = cp.word2vec_file
        load_fn = load_w2v
    else:
        # NOTE(review): fasttext is parsed with load_glove -- confirm the raw file
        # has the format that loader expects.
        key_file = cp.ft_embedding_key_file
        vec_file = cp.ft_embedding_vec_file
        raw_file = cp.fasttext_file
        load_fn = load_glove

    # Check if there already exists pre-loaded file
    if exists(key_file) and exists(vec_file):
        embedding_keys = du.json_load(key_file)
        embedding_vecs = np.load(vec_file)
    else:
        # If not, load it from text file
        embedding_keys, embedding_vecs = load_fn(raw_file)
        du.json_dump(embedding_keys, key_file)
        np.save(vec_file, embedding_vecs)

    print('Loading embedding done. %.3f s' % (time.time() - start_time))
    return embedding_keys, embedding_vecs
def tokenize_question_answer(qa):
    """
    Tokenize the question and the answers of every QA instance, with a file cache.

    If ``_mp.tokenize_qa`` exists its content is returned. Otherwise every question and
    answer is stripped, lower-cased and tokenized with ``word_tokenize`` (an empty answer
    becomes ``['.']``), and the result is dumped to ``_mp.tokenize_qa``.

    :param qa: iterable of QA instances; the instances are modified in place
    :return: list of the tokenized QA instances
    """
    if exists(_mp.tokenize_qa):
        return du.json_load(_mp.tokenize_qa)

    def _tokens(text):
        return word_tokenize(text.strip().lower())

    tokenize_qa = []
    for ins in tqdm(qa, desc='Tokenize qa'):
        ins['question'] = _tokens(ins['question'])
        ins['answers'] = [_tokens(ans) if ans else ['.'] for ans in ins['answers']]
        tokenize_qa.append(ins)

    du.json_dump(tokenize_qa, _mp.tokenize_qa)
    return tokenize_qa
def subtitle_process(video_data, frame_time, subtitle):
    """
    Sample frames for every key of ``video_data`` in parallel and persist the results.

    ``sample_frame_v2`` is used when ``args.one`` is set, ``sample_frame`` otherwise. The
    worker receives the shared inputs plus two shared output dicts, which are dumped to
    ``_mp.sample_frame_file`` and ``_mp.sample_index_file`` once all keys are processed.

    :param video_data: mapping whose keys are the work items handed to the workers
    :param frame_time: mapping forwarded to the sampling function
    :param subtitle: mapping forwarded to the sampling function
    :return: a copy of the shared sample dict
    """
    manager = Manager()
    shared_sample = manager.dict()
    shared_index = manager.dict()
    shared_video = manager.dict(video_data)
    shared_time = manager.dict(frame_time)
    shared_subt = manager.dict(subtitle)
    work_keys = list(shared_video.keys())

    sampler = sample_frame_v2 if args.one else sample_frame
    worker = partial(sampler, shared_video, shared_time, shared_subt,
                     shared_sample, shared_index)

    with Pool(4) as pool, tqdm(total=len(work_keys), desc="Align subtitle") as progress:
        for _ in pool.imap_unordered(worker, work_keys):
            progress.update()

    du.json_dump(shared_sample.copy(), _mp.sample_frame_file)
    du.json_dump(shared_index.copy(), _mp.sample_index_file)
    return shared_sample.copy()
def create_vocab_embedding(tokenize_subt, tokenize_qa):
    """
    Build the vocabulary and its L2-normalized n-gram based embedding matrix.

    If both ``_mp.embedding_file`` and ``_mp.vocab_file`` exist, the vocabulary is loaded
    from file. Otherwise every word returned by ``create_vocab`` that has at least one known
    gram gets the next free index (starting at 1) and an embedding equal to the sum of its
    gram vectors; each row is then divided by its norm (all-zero rows are left untouched).
    The vocabulary and the embedding are saved to their files.

    :param tokenize_subt: tokenized subtitles, forwarded to ``create_vocab``
    :param tokenize_qa: tokenized QA, forwarded to ``create_vocab``
    :return filter_vocab: dictionary mapping word to its index in the embedding matrix
    """
    if exists(_mp.embedding_file) and exists(_mp.vocab_file):
        return du.json_load(_mp.vocab_file)

    vocab = create_vocab(tokenize_subt, tokenize_qa)
    filter_vocab, idx_vocab = {}, 1
    gram_vocab = {k: i for i, k in enumerate(du.json_load(_ep.gram_vocab_file))}
    gram_embed = np.load(_ep.gram_embedding_vec_file)
    vocab_embed = np.zeros((len(vocab) + 1, gram_embed.shape[1]), dtype=np.float32)

    for v in tqdm(vocab, desc='Create embedding of vocabulary'):
        v_ = '<' + v + '>'
        # NOTE(review): the single characters are taken from the bracketed word here, so
        # '<' and '>' count as 1-grams -- confirm this matches how the gram vocabulary
        # was built.
        v_gram = [c for c in v_] + [v_[i:i + 3] for i in range(len(v_) - 2)] + \
                 [v_[i:i + 6] for i in range(len(v_) - 5)]
        v_gram_code = [gram_vocab[gram] for gram in v_gram if gram in gram_vocab]
        if v_gram_code:
            filter_vocab[v] = idx_vocab
            # Store the vector at the index recorded in filter_vocab. Using the running
            # position in vocab instead would shift every row after a skipped word away
            # from its vocabulary index.
            vocab_embed[idx_vocab] = np.sum(gram_embed[v_gram_code], axis=0)
            idx_vocab += 1

    # Rows at and beyond idx_vocab (reserved for skipped words) stay zero; a divisor of 1
    # keeps all-zero rows unchanged instead of dividing by zero.
    norm = np.linalg.norm(vocab_embed, axis=1, keepdims=True)
    norm = np.select([norm > 0], [norm], default=1.)
    print(norm.shape)
    norm_vocab_embed = vocab_embed / norm
    print(norm_vocab_embed.shape)

    du.json_dump(filter_vocab, _mp.vocab_file)
    np.save(_mp.embedding_file, norm_vocab_embed)
    return filter_vocab
def encode_sentence(tokenize_subt, tokenize_qa, vocab):
    """
    Replace the tokens of subtitles, questions and answers by their vocabulary indices.

    If both ``_mp.encode_subtitle_file`` and ``_mp.encode_qa_file`` exist, their content is
    returned. Otherwise tokens missing from ``vocab`` are dropped, a sentence left without
    any token is encoded as ``[vocab['.']]``, and both results are dumped to those files.

    :param tokenize_subt: mapping imdb key -> video key -> list of token lists
    :param tokenize_qa: list of QA instances with token lists; modified in place
    :param vocab: mapping from token to index; must contain '.' for the fallback
    :return: (encode_subt, encode_qa)
    """
    if exists(_mp.encode_subtitle_file) and exists(_mp.encode_qa_file):
        encode_subt = du.json_load(_mp.encode_subtitle_file)
        encode_qa = du.json_load(_mp.encode_qa_file)
        return encode_subt, encode_qa

    def _encode(tokens):
        ids = [vocab[w] for w in tokens if w in vocab]
        return ids if ids else [vocab['.']]

    encode_subt = {}
    for imdb in tqdm(tokenize_subt, desc='Encode subtitle'):
        clips = tokenize_subt[imdb]
        encode_subt[imdb] = {
            v: [_encode(sent) for sent in clips[v]]
            for v in clips
        }

    # The QA instances are encoded in place, so the caller's list is changed as well.
    encode_qa = tokenize_qa
    for ins in tqdm(encode_qa, desc='Encode question answer'):
        ins['question'] = _encode(ins['question'])
        answers = ins['answers']
        for pos, ans in enumerate(answers):
            answers[pos] = _encode(ans)

    du.json_dump(encode_subt, _mp.encode_subtitle_file)
    du.json_dump(encode_qa, _mp.encode_qa_file)
    return encode_subt, encode_qa
def _update_exp(self, item):
    """
    Merge ``item`` into ``self.exp`` and dump the result to ``config.exp_file``.

    :param item: value accepted by ``self.exp.update``
    :return: None
    """
    experiment = self.exp
    experiment.update(item)
    du.json_dump(experiment, config.exp_file)
def main():
    """
    Split, tokenize and encode the QA data and subtitles, and write the results to disk.

    The word embedding selected by ``args.embedding`` ('word2vec', 'fasttext' or 'glove')
    is loaded only when its processed files do not exist yet; in that case the vocabulary
    is built together with the embedding and both are saved. If the processed files exist,
    the vocabulary is derived from the saved word list. Without a known embedding the
    vocabulary is built from the training tokens alone.

    :return: None
    """
    start_time = time.time()
    video_data = du.json_load(config.video_data_file)
    video_subtitle = du.json_load(config.subtitle_file)
    shot_boundary = du.json_load(config.shot_boundary_file)
    subtitle_shot = du.json_load(config.subtitle_shot_file)
    # Use a context manager so the QA file handle is closed right after parsing.
    with open(config.qa_file, 'r') as qa_fp:
        qa = json.load(qa_fp)

    embed_file = None
    avail_embed_file = None
    avail_embed_npy_file = None
    if args.embedding == 'word2vec':
        embed_file = config.word2vec_file
        avail_embed_file = config.w2v_embedding_file
        avail_embed_npy_file = config.w2v_embedding_npy_file
    elif args.embedding == 'fasttext':
        embed_file = config.fasttext_file
        avail_embed_file = config.ft_embedding_file
        avail_embed_npy_file = config.ft_embedding_npy_file
    elif args.embedding == 'glove':
        embed_file = config.glove_file
        avail_embed_file = config.glove_embedding_file
        avail_embed_npy_file = config.glove_embedding_npy_file

    embedding = None
    # Without a selected embedding both paths are None and must not be handed to exists().
    embed_exist = (avail_embed_file is not None
                   and avail_embed_npy_file is not None
                   and exists(avail_embed_file)
                   and exists(avail_embed_npy_file))
    if embed_file and not embed_exist:
        embedding = du.load_w2v(embed_file)
    print('Loading json file done!! Take %.4f sec.' % (time.time() - start_time))

    total_qa = get_split(qa, video_data)

    def _count(split, flag):
        # Number of QA instances of a split whose flag is truthy.
        return sum(1 for qa_ in total_qa[split] if qa_[flag])

    print('Available qa # : train | tests | val ')
    print(' %5d %4d %3d' % (_count('train', 'avail'),
                            _count('tests', 'avail'),
                            _count('val', 'avail')))
    print('Mv+Sub qa # : train | tests | val ')
    print(' %5d %4d %3d' % (_count('train', 'mv+sub'),
                            _count('tests', 'mv+sub'),
                            _count('val', 'mv+sub')))
    print('Total qa # : train | tests | val ')
    print(' %5d %4d %3d' % (len(total_qa['train']),
                            len(total_qa['tests']),
                            len(total_qa['val'])))

    tokenize_qa_train, vocab_counter = \
        tokenize_sentences(total_qa['train'], video_subtitle, is_train=not embed_exist)

    # Build vocab
    if embed_file:
        if not embed_exist:
            qa_embedding, vocab, inverse_vocab = build_vocab(vocab_counter, embedding)
            fu.safe_remove(avail_embed_file)
            du.json_dump(inverse_vocab, avail_embed_file)
            fu.safe_remove(avail_embed_npy_file)
            np.save(avail_embed_npy_file,
                    np.array(list(qa_embedding.values()), dtype=np.float32))
        else:
            inverse_vocab = du.json_load(avail_embed_file)
            vocab = {k: i for i, k in enumerate(inverse_vocab)}
    else:
        _, vocab, inverse_vocab = build_vocab(vocab_counter)

    # encode sentences
    tokenize_qa_test, _ = tokenize_sentences(total_qa['tests'], video_subtitle)
    tokenize_qa_val, _ = tokenize_sentences(total_qa['val'], video_subtitle)
    encode_sub = encode_subtitles(video_subtitle, vocab, shot_boundary, subtitle_shot)
    encode_qa_train = encode_sentences(tokenize_qa_train, vocab)
    encode_qa_test = encode_sentences(tokenize_qa_test, vocab)
    encode_qa_val = encode_sentences(tokenize_qa_val, vocab)

    tokenize_qa = {
        'tokenize_qa_train': tokenize_qa_train,
        'tokenize_qa_test': tokenize_qa_test,
        'tokenize_qa_val': tokenize_qa_val,
    }
    encode_qa = {
        'encode_qa_train': encode_qa_train,
        'encode_qa_test': encode_qa_test,
        'encode_qa_val': encode_qa_val,
    }
    vocab_all = {
        'vocab': vocab,
        'inverse_vocab': inverse_vocab,
    }

    # Remove all stale outputs first, then write the fresh ones.
    fu.safe_remove(total_qa_file_name)
    fu.safe_remove(tokenize_file_name)
    fu.safe_remove(encode_file_name)
    fu.safe_remove(all_vocab_file_name)
    fu.safe_remove(config.encode_subtitle_file)

    du.json_dump(total_qa, total_qa_file_name)
    du.json_dump(tokenize_qa, tokenize_file_name)
    du.json_dump(encode_qa, encode_file_name)
    du.json_dump(vocab_all, all_vocab_file_name)
    du.json_dump(encode_sub, config.encode_subtitle_file)
def save(self):
    """
    Dump ``self.args`` to ``self.param_file`` via ``du.json_dump``.

    :return: None
    """
    params, destination = self.args, self.param_file
    du.json_dump(params, destination)
def process(args):
    """
    Process word embedding.

    Each word is split into its characters plus the 3-grams and 6-grams of the word wrapped
    in '<' and '>'. Statistics about the grams are printed. Unless ``args.debug`` is set, the
    gram counters, the gram vocabulary, the encoded words (padded with the index of ``UNK``)
    and the embedding vectors are saved for later training.

    :param args: named tuple, arguments
    :return: None
    """
    embedding_keys, embedding_vector = load_embedding_vec(cp.target)
    fu.block_print(['%s\'s # of embedding: %d' % (cp.target, len(embedding_keys)),
                    '%s\'s shape of embedding vec: %s' % (cp.target,
                                                          str(embedding_vector.shape))])
    print('Length set of words:\n', set([len(k) for k in embedding_keys]))
    embedding_keys, embedding_vector = filter_stat(embedding_keys, embedding_vector,
                                                   cp.max_length)

    counter_1gram = Counter()
    counter_3gram = Counter()
    counter_6gram = Counter()
    gram_embedding_keys = []

    # Update counter and divide each word to n-gram.
    for word in tqdm(embedding_keys, desc='Counting'):
        wrapped = '<' + word + '>'
        three_gram = [wrapped[i:i + 3] for i in range(len(wrapped) - 2)]
        six_gram = [wrapped[i:i + 6] for i in range(len(wrapped) - 5)]

        # Single characters come from the bare word, longer grams from the wrapped one.
        counter_1gram.update(word)
        counter_3gram.update(three_gram)
        counter_6gram.update(six_gram)
        gram_embedding_keys.append(list(word) + three_gram + six_gram)

    size_set = {len(grams) for grams in gram_embedding_keys}
    max_size = max(size_set, default=0)

    print('Max size of tokens:', max_size)
    print('Size set of tokens:', size_set)
    print('Number of grams:\n',
          '1-gram:', len(counter_1gram),
          '3-gram:', len(counter_3gram),
          '6-gram:', len(counter_6gram))

    if args.debug:
        # Debug runs only print the statistics; nothing is written.
        return

    du.json_dump({'1': counter_1gram, '3': counter_3gram, '6': counter_6gram},
                 cp.gram_counter_file)
    vocab = list(counter_1gram) + list(counter_3gram) + list(counter_6gram) + [UNK]
    du.json_dump(vocab, cp.gram_vocab_file)

    gtoi = {gram: idx for idx, gram in enumerate(vocab)}
    # Every slot starts as the last index (UNK was appended last) and acts as padding.
    encoded_embedding_keys = np.full((len(embedding_keys), max_size), len(gtoi) - 1,
                                     dtype=np.int64)
    for row, grams in enumerate(tqdm(gram_embedding_keys, desc='Encoding')):
        encoded_embedding_keys[row, :len(grams)] = [gtoi[gram] for gram in grams]
    print(encoded_embedding_keys[:5])

    assert len(encoded_embedding_keys) == len(embedding_vector), \
        'First dimensions of encoded keys and vectors are not matched.'

    start_time = time.time()
    fu.safe_remove(cp.encode_embedding_key_file)
    fu.safe_remove(cp.encode_embedding_vec_file)
    np.save(cp.encode_embedding_key_file, encoded_embedding_keys)
    np.save(cp.encode_embedding_vec_file, embedding_vector)
    print('Saveing processed data with %.3f s' % (time.time() - start_time))