import os

import numpy as np

import util


def compute_training_pairs_w_queries_variations(fold_idx, coll, n_iter_per_query, gt_file,
                                                dbn, qbn, ftt_model, iwi, wi):
    model_name = 'qn_rd_nrd_pairs_w2v_gk_' + str(fold_idx) + '_' + str(coll) + '_' + str(n_iter_per_query)
    if not os.path.isfile(model_name):
        # split the ground-truth judgments into relevant / non-relevant docs per query
        rd_b_qry = {}
        nrd_by_qry = {}
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn:
                continue
            rj = int(data[3].strip())
            if qname not in rd_b_qry:
                rd_b_qry[qname] = []
                nrd_by_qry[qname] = []
            if rj > 0:
                rd_b_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)
        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)
        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_b_qry:
                continue
            # add training examples with the original query
            encoded_q = qbn[qn]
            tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append((encoded_q, dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))
            print('original query: ' + ' '.join([iwi[w] for w in encoded_q]))
            # add extra training examples built from query variations
            for i in range(len(encoded_q)):
                curr_q_word = iwi[encoded_q[i]]
                similar_words = get_synonyms(curr_q_word, ftt_model)
                for sw in similar_words:
                    sw = util.stem(sw)
                    if sw in wi and curr_q_word != sw:
                        print('word = ' + curr_q_word + ', substitute = ' + sw)
                        # copy the query so each variation is independent and the
                        # original encoding is not mutated through aliasing
                        encoded_q_variation = list(encoded_q)
                        encoded_q_variation[i] = wi[sw]
                        print('alternative query: ' + ' '.join([iwi[w] for w in encoded_q_variation]))
                        tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
                        tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
                        for j in range(n_iter_per_query):
                            qn_rd_nrd_pairs.append((encoded_q_variation, dbn[tmp_rdocs[j]], dbn[tmp_nrdocs[j]]))
        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(qn_rd_nrd_pairs, model_name)
    else:
        qn_rd_nrd_pairs = util.load_model(model_name)
    return qn_rd_nrd_pairs
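# get_synonyms is called above but not defined in this snippet. A minimal sketch of
# what it could look like, assuming ftt_model is a pre-4.0 gensim FastText model
# (consistent with the wv.vocab usage elsewhere in this repo); the name, the vocab
# guard, and the top-10 cutoff are assumptions, not the original implementation.
def get_synonyms(word, ftt_model, topn=10):
    if word not in ftt_model.wv.vocab:
        return []
    # most_similar returns (word, score) pairs; keep only the words
    return [w for w, _ in ftt_model.wv.most_similar(positive=[word], topn=topn)]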
import os

import numpy as np
from tqdm import tqdm

import util


def encode_collection_with_stemming(text_by_name_p, word_dict_path, w2v_model_path,
                                    encoded_out_folder, wi=None, word_embeddings_matrix=None):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split('.')[0]] = ' '.join(open(fp, 'r').readlines())
    # initialize the embeddings matrix
    if word_embeddings_matrix is None:
        # read the word index, merging the ids of words that share a stem
        if wi is None:
            wi = {}
            wids_to_merge = {}
            for line in tqdm(open(word_dict_path)):
                data = line.split()
                word_stemmed = util.stem(data[0].strip())
                wid = int(data[1].strip())
                if word_stemmed not in wi:
                    wi[word_stemmed] = len(wi)
                    wids_to_merge[word_stemmed] = [wid]
                else:
                    wids_to_merge[word_stemmed].append(wid)
        we_size = 50
        word_embeddings_matrix = np.float32(
            np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size]))
        # the last row of the matrix is reserved for the padding token
        padding_value = np.zeros(we_size)
        word_embeddings_matrix[word_embeddings_matrix.shape[0] - 1] = padding_value
        w2v_model = load_w2v_we(w2v_model_path)
        for k, v in wi.items():
            # sum the embeddings of all the word ids merged into the same stem
            we = np.zeros(we_size)
            summed_something = False
            for wid in wids_to_merge[k]:
                if wid in w2v_model.keys():
                    we = np.sum((we, w2v_model[wid]), axis=0)
                    summed_something = True
            if summed_something:
                we = we / np.linalg.norm(we)  # normalize the new word embedding
                word_embeddings_matrix[v] = we
    encoded_docs_by_name = {}
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        # tokenize, stem, remove stopwords, then map each token to its word id
        td = util.tokenize(dc, stemming=True, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, word_embeddings_matrix
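# Hypothetical usage of encode_collection_with_stemming; all paths below are
# placeholders. On the first call the word index and embeddings matrix are built
# from scratch; passing them back in lets a second collection (e.g. the queries)
# reuse the same vocabulary and embeddings.
docs, wi, we_matrix = encode_collection_with_stemming(
    'data/docs_txt', 'data/word_dict.txt', 'data/w2v.model', 'data/encoded_docs')
queries, wi, we_matrix = encode_collection_with_stemming(
    'data/queries_txt', 'data/word_dict.txt', 'data/w2v.model', 'data/encoded_queries',
    wi=wi, word_embeddings_matrix=we_matrix)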
from tqdm import tqdm

import util


def compute_docs_to_rerank_by_query(queries_names, qbn, dbn, iwi, ii, fasttext_vec_model):
    docs_to_rerank_by_qry = {}
    for qn in tqdm(queries_names):
        q = qbn[qn]
        for qw in q:
            query_word = iwi[qw]
            if query_word not in fasttext_vec_model.wv.vocab:
                continue
            # find the terms most similar to the stemmed query word
            similar_words = [w[0] for w in
                             fasttext_vec_model.wv.most_similar(positive=[query_word], topn=10)]
            for w in similar_words:
                # stem the most similar words found in the model
                w = util.stem(w)
                if w in ii:
                    if qn not in docs_to_rerank_by_qry:
                        docs_to_rerank_by_qry[qn] = []
                    # add every document in the posting list of the expanded term
                    docs_to_rerank_by_qry[qn].extend([pl[0] for pl in ii[w]])
    return docs_to_rerank_by_qry
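# Hypothetical usage: qbn, dbn, iwi and ii would come from the encoding step above,
# and the posting lists in ii are assumed to be (doc_name, frequency) pairs, matching
# the pl[0] access in the function. Since the same document can be reached through
# several expansion terms, the candidate lists can be de-duplicated before reranking.
candidates = compute_docs_to_rerank_by_query(list(qbn.keys()), qbn, dbn, iwi, ii,
                                             fasttext_vec_model)
candidates = {qn: list(set(docs)) for qn, docs in candidates.items()}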
import argparse
import os

from util import os_command, stem

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Input video.')
    parser.add_argument('save_dir', help='Save directory.')
    parser.add_argument(
        '--st_pos', default='00:00:10',
        help='Starting position in format hh:mm:ss[.xxx]. [00:00:10]')
    parser.add_argument('--fps', default='5', help='Frames per second. [5]')
    parser.add_argument('--max_frames', default=200, type=int,
                        help='Maximum number of frames. [200]')
    args = parser.parse_args()

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    vid_stem = stem(args.input)
    command = [
        'ffmpeg', '-i', args.input, '-r', args.fps, '-ss', args.st_pos,
        '-f', 'image2', '-vf', 'scale=-1:500', '-q:v', '2'
    ]
    if args.max_frames > 0:
        # command arguments must be strings, so cast the frame count
        command += ['-vframes', str(args.max_frames)]
    command += [os.path.join(args.save_dir, vid_stem + '_%04d.JPEG')]
    os_command(command)
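# Example invocation, assuming the script is saved as extract_frames.py (the file
# name and paths are placeholders). For the defaults above it builds roughly:
#
#   python extract_frames.py in.mp4 frames --fps 5 --max_frames 200
#
#   ffmpeg -i in.mp4 -r 5 -ss 00:00:10 -f image2 -vf scale=-1:500 -q:v 2 \
#          -vframes 200 frames/in_%04d.JPEG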
import argparse
import codecs
import re

# remove_punct, stem and get_stopwords are assumed to be defined or imported
# elsewhere in the original script; they are not shown in this snippet.

parser = argparse.ArgumentParser()
# reconstructed: the loop below reads args.en_file, so the parser must define it
parser.add_argument('--en_file', dest='en_file', action='store', type=str,
                    help='tokenized english file')
parser.add_argument('--ja_file', dest='ja_file', action='store', type=str,
                    help='tokenized japanese file')
parser.add_argument('--output', dest='output', action='store', type=str,
                    help='output dict file')
parser.add_argument('--stem', dest='stem', action='store_true',
                    help='stem english')
parser.add_argument('--nostop', dest='nostop', action='store_true', default=False,
                    help='remove english stopwords')
args = parser.parse_args()

stopwords = []
if args.nostop:
    stopwords = get_stopwords()

nerr = 0
with open(args.en_file, 'r') as fin_en, \
        codecs.open(args.ja_file, 'r', encoding='utf-8') as fin_ja, \
        codecs.open(args.output, 'w', encoding='utf-8') as fout:
    for en, ja in zip(fin_en, fin_ja):
        en = remove_punct(en.strip()).strip().split()
        # keep only single-word English entries for the dictionary
        if len(en) == 1:
            if args.stem:
                en = ' '.join(stem(en))
            else:
                en = ' '.join(en)
            if en in stopwords:
                continue
            ja = remove_punct(ja.strip()).strip().split()
            ja = ' '.join(ja)
            # skip pairs whose "Japanese" side is actually ASCII text; re.ASCII
            # keeps \w ASCII-only, matching the original Python 2 behaviour
            if re.match(r'\w+', ja, re.ASCII):
                continue
            try:
                fout.write('%s @ %s\n' % (ja, en))
                # fout.write('%s <> %s\n' % (en, ja))
            except UnicodeDecodeError:
                nerr += 1
                continue
print('errors:', nerr)
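# A minimal sketch of the remove_punct helper used above, assuming it simply
# replaces ASCII punctuation with spaces; the original implementation is not
# shown in this snippet, so treat this as an illustration only.
import string


def remove_punct(text):
    # replace each punctuation character with a space so tokens stay separated
    return ''.join(c if c not in string.punctuation else ' ' for c in text)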