def gen_pairwise():
    """Convert the pointwise train/test files into pairwise files.

    For every query, each ordered pair of its samples whose first relevance
    exceeds the second is written as one pairwise line:
    qid <tab> doc1 <tab> doc2 <tab> (rel1 - rel2).
    Prints a pointwise-to-pairwise sample-count summary per output file.
    """
    pointwise_files = [
        os.path.join(args.data_dir, 'train.prep.pointwise'),
        os.path.join(args.data_dir, 'test.prep.pointwise'),
    ]
    for src in pointwise_files:
        dest = src.rsplit('.', 1)[0] + '.pairwise'
        records = load_train_test_file(src, file_format=args.format,
                                       reverse=args.reverse)
        # groupby assumes records arrive ordered by query id
        grouped = groupby(records, lambda r: r[0])
        n_pointwise = 0
        n_pairwise = 0
        with open(dest, 'w') as out:
            for _, group in grouped:
                group = list(group)
                pair_count = 0
                for left in group:
                    for right in group:
                        # keep only pairs with a strict relevance gap
                        if left[2] > right[2]:
                            pair_count += 1
                            out.write('{}\t{}\t{}\t{}\n'.format(
                                left[0], left[1], right[1],
                                left[2] - right[2]))
                n_pointwise += len(group)
                n_pairwise += pair_count
        print('from {} samples to {} samples in {}'.format(
            n_pointwise, n_pairwise, dest))
def get_query_doc_ids(filepath):
    """Collect the query ids and doc ids referenced by a samples file.

    Parameters:
        filepath: path to a '*pointwise' or '*pairwise' samples file.

    Returns:
        (query_ids, doc_ids) as two sets. For pairwise files both document
        columns (s[1] and s[2]) contribute to doc_ids.
    """
    query_ids, doc_ids = set(), set()
    is_pairwise = filepath.endswith('pairwise')
    # Single pass instead of three list comprehensions: behaves correctly
    # even if load_train_test_file returns a one-shot iterator, and avoids
    # building throwaway intermediate lists.
    for s in load_train_test_file(filepath):
        query_ids.add(s[0])
        doc_ids.add(s[1])
        if is_pairwise:
            doc_ids.add(s[2])
    return query_ids, doc_ids
def filter_samples(filepath_old, filepath_new, filter_query, filter_doc):
    """Copy a samples file, dropping samples that hit the filter sets.

    A sample is removed when its query id is in filter_query or any of its
    doc ids is in filter_doc ('pairwise' files check both doc columns).
    Files with neither suffix are copied through unchanged.
    """
    samples = load_train_test_file(filepath_old)
    if filepath_old.endswith('pointwise'):
        kept = []
        for s in samples:
            if s[0] in filter_query or s[1] in filter_doc:
                continue
            kept.append(s)
        samples = kept
    elif filepath_old.endswith('pairwise'):
        kept = []
        for s in samples:
            if s[0] in filter_query or s[1] in filter_doc or s[2] in filter_doc:
                continue
            kept.append(s)
        samples = kept
    save_train_test_file(samples, filepath_new)
def drop_negative(keep=0.15):
    """Randomly downsample negative training samples.

    Positive samples (last field > 0) are always retained; each negative
    sample survives with probability `keep`. The result is written to
    'train.prep.neg<keep>.<paradigm>' in args.data_dir.

    Parameters:
        keep: fraction of negatives to retain. Defaults to 0.15, the value
              that was previously hard-coded, so existing callers are
              unaffected.
    """
    train_filename = os.path.join(args.data_dir,
                                  'train.prep.{}'.format(args.paradigm))
    out_filename = os.path.join(
        args.data_dir, 'train.prep.neg{}.{}'.format(keep, args.paradigm))
    samples = load_train_test_file(train_filename, file_format=args.format,
                                   reverse=args.reverse)
    save_train_test_file(
        [s for s in samples if (s[-1] > 0) or (random.random() <= keep)],
        out_filename, file_format=args.format)
def gen_pairwise():
    """Convert pointwise train/test files into pairwise files.

    NOTE(review): this is a duplicate definition — it shadows the earlier
    gen_pairwise in this module, which also reported sample-count
    statistics. The summary reporting is restored here so the shadowing
    loses no functionality; consider deleting one of the two definitions.
    """
    train_pointwise = os.path.join(args.data_dir, 'train.prep.pointwise')
    test_pointwise = os.path.join(args.data_dir, 'test.prep.pointwise')
    for fn in [train_pointwise, test_pointwise]:
        fn_out = fn.rsplit('.', 1)[0] + '.pairwise'
        samples = load_train_test_file(fn, file_format=args.format,
                                       reverse=args.reverse)
        # groupby assumes samples arrive ordered by query id
        samples_gb_q = groupby(samples, lambda x: x[0])
        all_pointwise = 0
        all_pairwise = 0
        with open(fn_out, 'w') as fout:
            for q, q_samples in samples_gb_q:
                q_samples = list(q_samples)
                all_pointwise += len(q_samples)
                for s1 in q_samples:
                    for s2 in q_samples:
                        # only emit pairs with a strict relevance gap
                        if s1[2] > s2[2]:
                            all_pairwise += 1
                            fout.write('{}\t{}\t{}\t{}\n'.format(
                                s1[0], s1[1], s2[1], s1[2] - s2[2]))
        print('from {} samples to {} samples in {}'.format(
            all_pointwise, all_pairwise, fn_out))
def data_assemble(filepath, query_raw, doc_raw, max_q_len, max_d_len, relevance_mapper=None):
    """Load a '*pointwise' or '*pairwise' samples file and assemble per-query data.

    Parameters:
        filepath: samples file; the 'pointwise'/'pairwise' suffix selects the
            assembly branch, any other suffix raises NotImplementedError.
        query_raw: mapping from query id to token sequence.
        doc_raw: mapping from doc id to token sequence.
        max_q_len: queries are truncated to this many tokens.
        max_d_len: docs are truncated to this many tokens.
        relevance_mapper: optional callable applied to pointwise relevance
            labels; defaults to identity.

    Returns:
        (X, y, batcher): X and y hold one dict per query; batcher is a
        generator function that yields padded numpy batches from them.
    """
    relevance_mapper = relevance_mapper or (lambda x: x)
    samples = load_train_test_file(filepath, file_format=args.format, reverse=args.reverse)
    # groupby only merges *consecutive* equal keys, hence the requirement below.
    samples_gb_q = groupby(samples, lambda x: x[0])  # queries should be sorted
    X = []
    y = []

    def batcher(X, y=None, batch_size=128, use_permutation=True, batch_num=None):
        """Yield (batch_dict, elapsed_seconds) tuples of <= batch_size samples.

        Batches are filled query by query, so one query's samples may be
        split across consecutive batches. With use_permutation, queries are
        visited in random order. batch_num, if given, stops iteration after
        that many batches.
        """
        rb = batch_size  # remaining free slots in the batch being filled
        result = {
            'qid': [],
            'docid': [],
            'qd_size': [],
            'relevance': [],
            'query': [],
            'doc': [],
        }
        query_ind = 0  # index into the (possibly permuted) query list
        doc_ind = 0    # index into the current query's samples
        total_batch_num = 0
        if use_permutation:
            # permutation wrt query
            perm = np.random.permutation(len(X))
        else:
            perm = list(range(len(X)))
        start_time = time.time()
        while query_ind < len(X):
            # NOTE(review): 'y != None' (here and below) should be
            # 'y is not None'; equality comparison against None is fragile.
            q_x = X[perm[query_ind]]
            q_y = y[perm[query_ind]] if y != None else None
            remain_n_sample = len(q_x['query']) - doc_ind
            take_n_sample = min(remain_n_sample, rb)
            for d in range(doc_ind, doc_ind + take_n_sample):
                result['qid'].append(q_x['qid'][d])
                result['docid'].append(q_x['docid'][d])
                result['qd_size'].append(q_x['qd_size'][d])
                if q_y != None:
                    result['relevance'].append(q_y['relevance'][d])
                result['query'].append(q_x['query'][d])
                result['doc'].append(q_x['doc'][d])
            rb -= take_n_sample
            doc_ind += take_n_sample
            if rb > 0 or doc_ind >= len(q_x['query']):
                # current query exhausted: advance to the next one
                query_ind += 1
                doc_ind = 0
            if rb == 0 or (len(result['qd_size']) > 0 and query_ind >= len(X)):
                # batch full, or final partial batch: pad and emit
                yield_result = {
                    'qd_size': np.array(result['qd_size'], dtype=np.int32),
                }
                if q_y != None:
                    yield_result['relevance'] = np.array(result['relevance'], dtype=np.int32)
                # pad queries/docs to the longest length within this batch
                yield_result['query'] = data_pad(
                    result['query'], np.max(yield_result['qd_size'][:, 0]), np.int32)
                yield_result['doc'] = data_pad(
                    result['doc'], np.max(yield_result['qd_size'][:, 1]), np.int32)
                yield_result['qid'] = np.array(result['qid'], dtype=str)
                yield_result['docid'] = np.array(result['docid'], dtype=str)
                total_batch_num += 1
                yield yield_result, time.time() - start_time
                start_time = time.time()
                if batch_num and total_batch_num >= batch_num:
                    # end the batcher without traverse all the samples
                    break
                # reset accumulators for the next batch
                rb = batch_size
                result = {
                    'qid': [],
                    'docid': [],
                    'qd_size': [],
                    'relevance': [],
                    'query': [],
                    'doc': [],
                }

    if filepath.endswith('pointwise'):
        # each sample is (query_id, doc_id, relevance)
        for q, q_samples in samples_gb_q:
            q_x = {
                'query': [],
                'doc': [],
                'qd_size': [],
                'max_q_len': max_q_len,
                'max_d_len': max_d_len,
                'qid': [],
                'docid': [],
            }
            q_y = {
                'relevance': [],
            }
            for s in q_samples:
                # use max_q_len and max_d_len to filter the queries and documents
                qm = query_raw[s[0]][:max_q_len]
                dm = doc_raw[s[1]][:max_d_len]
                q_x['query'].append(qm)
                q_x['doc'].append(dm)
                q_x['qd_size'].append([len(qm), len(dm)])
                q_x['qid'].append(s[0])
                q_x['docid'].append(s[1])
                q_y['relevance'].append(relevance_mapper(s[2]))
            X.append(q_x)
            y.append(q_y)
        return X, y, batcher
    elif filepath.endswith('pairwise'):
        # each sample is (query_id, doc1_id, doc2_id, label_diff); every kept
        # pair contributes two consecutive entries (doc1 then doc2)
        for q, q_samples in samples_gb_q:
            q_x = {
                'query': [],
                'doc': [],
                'qd_size': [],
                'max_q_len': max_q_len,
                'max_d_len': max_d_len,
                'qid': [],
                'docid': [],
            }
            q_y = {
                'relevance': [],
            }
            for s in q_samples:
                # use max_q_len and max_d_len to filter the queries and documents
                if s[3] == 0:
                    # only consider pairs with difference
                    continue
                qm = query_raw[s[0]][:max_q_len]
                dm1 = doc_raw[s[1]][:max_d_len]
                dm2 = doc_raw[s[2]][:max_d_len]
                q_x['query'].append(qm)
                q_x['query'].append(qm)
                q_x['qid'].append(s[0])
                q_x['qid'].append(s[0])
                if s[3] < 0:
                    # only use positive pairs
                    # swap docs so the preferred document always comes first
                    dm = dm1
                    dm1 = dm2
                    dm2 = dm
                    q_x['docid'].append(s[2])
                    q_x['docid'].append(s[1])
                    # NOTE(review): s[2] is the second *doc id*, not the
                    # label — s[3] (the relevance difference) looks like the
                    # intended value here and in the else branch; confirm
                    # against how y['relevance'] is consumed downstream.
                    q_y['relevance'].append(-s[2])
                    q_y['relevance'].append(-s[2])
                else:
                    q_x['docid'].append(s[1])
                    q_x['docid'].append(s[2])
                    q_y['relevance'].append(s[2])
                    q_y['relevance'].append(s[2])
                q_x['doc'].append(dm1)
                q_x['doc'].append(dm2)
                q_x['qd_size'].append([len(qm), len(dm1)])
                q_x['qd_size'].append([len(qm), len(dm2)])
            X.append(q_x)
            y.append(q_y)

        def pairwise_batcher(X, y=None, batch_size=128, use_permutation=True, batch_num=None):
            # Enforces an even batch_size — presumably so the two docs of a
            # pair always land in the same batch; verify with callers.
            if batch_size % 2 != 0:
                raise Exception(
                    'this batcher can\'t be used in pairwise approach')
            return batcher(X, y=y, batch_size=batch_size, use_permutation=use_permutation, batch_num=batch_num)
        return X, y, pairwise_batcher
    else:
        raise NotImplementedError()
def gen_tfrecord():
    """Convert prepared train/test sample files into sharded TFRecord files.

    Reads 'train.prep.<paradigm>' and 'test.prep.<paradigm>' from
    args.data_dir and writes '<name>.tfrecord-XXX-of-YYY' shard files,
    assigning each example to a randomly chosen shard.
    """
    # number of tfrecord file to save
    num_shards = 1
    print('use {} shards'.format(num_shards))

    def _pick_output_shard():
        # spread examples uniformly at random over the shard files
        return random.randint(0, num_shards - 1)

    print('load text file ...')
    doc_file = os.path.join(args.data_dir, 'docs.prep')
    if args.format == 'ir':
        query_file = os.path.join(args.data_dir, 'query.prep')
    doc_raw = load_prep_file(doc_file, file_format=args.format)
    if args.format == 'ir':
        query_raw = load_prep_file(query_file, file_format=args.format)
    else:
        # non-'ir' formats keep queries and docs in the same id space
        query_raw = doc_raw
    train_filename = os.path.join(args.data_dir,
                                  'train.prep.{}'.format(args.paradigm))
    test_filename = os.path.join(args.data_dir,
                                 'test.prep.{}'.format(args.paradigm))
    for fn in [train_filename, test_filename]:
        print('convert "{}" ...'.format(fn))
        output_file = fn + '.tfrecord'
        writers = []
        for i in range(num_shards):
            writers.append(
                tf.python_io.TFRecordWriter('%s-%03i-of-%03i' %
                                            (output_file, i, num_shards)))
        try:
            samples = load_train_test_file(fn, file_format=args.format,
                                           reverse=args.reverse)
            for i, sample in enumerate(samples):
                if i % 100000 == 0:
                    # progress report in units of 10k samples
                    print('{}w'.format(i // 10000))
                features = {}
                if args.paradigm == 'pointwise':
                    q, d, r = sample
                    qb = q.encode('utf-8')
                    db = d.encode('utf-8')
                    features['docid'] = tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[db]))
                    features['doc'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=doc_raw[d]))
                    features['doclen'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[len(doc_raw[d])]))
                elif args.paradigm == 'pairwise':
                    q, d1, d2, r = sample
                    qb = q.encode('utf-8')
                    d1b = d1.encode('utf-8')
                    # bug fix: was d1.encode('utf-8'), so the 'docid2'
                    # feature always carried the id of the first document
                    d2b = d2.encode('utf-8')
                    features['docid1'] = tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[d1b]))
                    features['docid2'] = tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[d2b]))
                    features['doc1'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=doc_raw[d1]))
                    features['doc2'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=doc_raw[d2]))
                    features['doc1len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[len(doc_raw[d1])]))
                    features['doc2len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[len(doc_raw[d2])]))
                features['qid'] = tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[qb]))
                features['query'] = tf.train.Feature(int64_list=tf.train.Int64List(
                    value=query_raw[q]))
                features['label'] = tf.train.Feature(float_list=tf.train.FloatList(
                    value=[r]))
                features['qlen'] = tf.train.Feature(int64_list=tf.train.Int64List(
                    value=[len(query_raw[q])]))
                f = tf.train.Features(feature=features)
                example = tf.train.Example(features=f)
                # randomly choose a shard to save the example
                writers[_pick_output_shard()].write(example.SerializeToString())
        finally:
            # bug fix: writers were never closed before, leaking file handles
            # and risking unflushed buffered records
            for w in writers:
                w.close()