Example #1
def gen_pairwise():
    train_pointwise = os.path.join(args.data_dir, 'train.prep.pointwise')
    test_pointwise = os.path.join(args.data_dir, 'test.prep.pointwise')
    for fn in [train_pointwise, test_pointwise]:
        fn_out = fn.rsplit('.', 1)[0] + '.pairwise'
        samples = load_train_test_file(fn,
                                       file_format=args.format,
                                       reverse=args.reverse)
        # itertools.groupby only groups consecutive items, so the samples
        # must already be sorted by query id
        samples_gb_q = groupby(samples, lambda x: x[0])
        with open(fn_out, 'w') as fout:
            all_pointwise = 0
            all_pairwise = 0
            for q, q_samples in samples_gb_q:
                q_samples = list(q_samples)
                count = 0
                for s1 in q_samples:
                    for s2 in q_samples:
                        if s1[2] > s2[2]:
                            count += 1
                            fout.write('{}\t{}\t{}\t{}\n'.format(
                                s1[0], s1[1], s2[1], s1[2] - s2[2]))
                #print('query {}, #pointwise {}, #pairwise {}, ratio {}'.format(
                #  q, len(q_samples), count, count/(len(q_samples) or 1)))
                all_pointwise += len(q_samples)
                all_pairwise += count
            print('from {} samples to {} samples in {}'.format(
                all_pointwise, all_pairwise, fn_out))
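The expansion above emits, for each query, every ordered document pair with a positive label gap. A minimal self-contained sketch of the same rule on in-memory data (the sample tuples are hypothetical; no file I/O or prep.py helpers involved):

from itertools import groupby

samples = [  # (query_id, doc_id, relevance), already sorted by query_id
    ('q1', 'd1', 2), ('q1', 'd2', 0), ('q1', 'd3', 1),
    ('q2', 'd4', 1), ('q2', 'd5', 1),
]
for q, q_samples in groupby(samples, key=lambda s: s[0]):
    q_samples = list(q_samples)
    for s1 in q_samples:
        for s2 in q_samples:
            if s1[2] > s2[2]:  # keep only pairs with a positive label gap
                print('{}\t{}\t{}\t{}'.format(s1[0], s1[1], s2[1],
                                              s1[2] - s2[2]))
# q1 yields (d1, d2, 2), (d1, d3, 1), (d3, d2, 1); q2 yields nothing
# because its two labels are equal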
Example #2
File: prep.py Project: jzbjyb/rri
def get_query_doc_ids(filepath):
    # collect the distinct query ids and doc ids that appear in the file
    samples = load_train_test_file(filepath)
    query_ids = {s[0] for s in samples}
    doc_ids = {s[1] for s in samples}
    if filepath.endswith('pairwise'):
        # pairwise samples carry a second doc id in the third column
        doc_ids |= {s[2] for s in samples}
    return query_ids, doc_ids
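A usage sketch, assuming a tab-separated pairwise file at a hypothetical path; on a pairwise file the doc id set also covers the second document of each pair:

query_ids, doc_ids = get_query_doc_ids('data/train.prep.pairwise')
print('{} distinct queries, {} distinct docs'.format(len(query_ids),
                                                     len(doc_ids)))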
Example #3
File: prep.py Project: jzbjyb/rri
def filter_samples(filepath_old, filepath_new, filter_query, filter_doc):
    samples = load_train_test_file(filepath_old)
    if filepath_old.endswith('pointwise'):
        samples = [
            s for s in samples
            if s[0] not in filter_query and s[1] not in filter_doc
        ]
    elif filepath_old.endswith('pairwise'):
        samples = [
            s for s in samples if s[0] not in filter_query
            and s[1] not in filter_doc and s[2] not in filter_doc
        ]
    save_train_test_file(samples, filepath_new)
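One plausible way the two helpers compose: collect the ids that have no preprocessed text and drop every sample touching them. The query_raw/doc_raw dicts and file paths below are assumptions, mirroring how load_prep_file is used elsewhere in prep.py:

query_ids, doc_ids = get_query_doc_ids('data/train.prep.pointwise')
missing_queries = query_ids - set(query_raw)  # ids without raw text
missing_docs = doc_ids - set(doc_raw)
filter_samples('data/train.prep.pointwise',
               'data/train.filtered.prep.pointwise',
               missing_queries, missing_docs)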
Example #4
def drop_negative():
    # keep every positive sample and roughly 15% of the negatives
    keep = 0.15
    train_filename = os.path.join(args.data_dir,
                                  'train.prep.{}'.format(args.paradigm))
    out_filename = os.path.join(
        args.data_dir, 'train.prep.neg{}.{}'.format(keep, args.paradigm))
    samples = load_train_test_file(train_filename,
                                   file_format=args.format,
                                   reverse=args.reverse)
    save_train_test_file(
        [s for s in samples if (s[-1] > 0) or (random.random() <= keep)],
        out_filename,
        file_format=args.format)
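The filter keeps every positive sample (s[-1] > 0) and retains each negative independently with probability keep, so the expected output size is #positives + 0.15 * #negatives. A standalone sketch of the same sampling rule on hypothetical data:

import random

random.seed(0)
keep = 0.15
samples = [('q1', 'd{}'.format(i), int(i % 10 == 0)) for i in range(1000)]
kept = [s for s in samples if s[-1] > 0 or random.random() <= keep]
print(len(kept))  # 100 positives plus ~15% of 900 negatives, roughly 235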
Example #5
File: prep.py Project: jzbjyb/rri
def gen_pairwise():
    train_pointwise = os.path.join(args.data_dir, 'train.prep.pointwise')
    test_pointwise = os.path.join(args.data_dir, 'test.prep.pointwise')
    for fn in [train_pointwise, test_pointwise]:
        fn_out = fn.rsplit('.', 1)[0] + '.pairwise'
        samples = load_train_test_file(fn,
                                       file_format=args.format,
                                       reverse=args.reverse)
        # groupby requires samples already sorted by query id
        samples_gb_q = groupby(samples, lambda x: x[0])
        with open(fn_out, 'w') as fout:
            for q, q_samples in samples_gb_q:
                q_samples = list(q_samples)
                for s1 in q_samples:
                    for s2 in q_samples:
                        if s1[2] > s2[2]:
                            fout.write('{}\t{}\t{}\t{}\n'.format(
                                s1[0], s1[1], s2[1], s1[2] - s2[2]))
Example #6
def data_assemble(filepath,
                  query_raw,
                  doc_raw,
                  max_q_len,
                  max_d_len,
                  relevance_mapper=None):
    relevance_mapper = relevance_mapper or (lambda x: x)
    samples = load_train_test_file(filepath,
                                   file_format=args.format,
                                   reverse=args.reverse)
    samples_gb_q = groupby(samples, lambda x: x[0])  # queries should be sorted
    X = []
    y = []

    def batcher(X,
                y=None,
                batch_size=128,
                use_permutation=True,
                batch_num=None):
        rb = batch_size  # remaining room in the current batch
        result = {
            'qid': [],
            'docid': [],
            'qd_size': [],
            'relevance': [],
            'query': [],
            'doc': [],
        }
        query_ind = 0
        doc_ind = 0
        total_batch_num = 0
        if use_permutation:
            # permutation wrt query
            perm = np.random.permutation(len(X))
        else:
            perm = list(range(len(X)))
        start_time = time.time()
        while query_ind < len(X):
            q_x = X[perm[query_ind]]
            q_y = y[perm[query_ind]] if y is not None else None
            remain_n_sample = len(q_x['query']) - doc_ind
            take_n_sample = min(remain_n_sample, rb)
            for d in range(doc_ind, doc_ind + take_n_sample):
                result['qid'].append(q_x['qid'][d])
                result['docid'].append(q_x['docid'][d])
                result['qd_size'].append(q_x['qd_size'][d])
                if q_y is not None:
                    result['relevance'].append(q_y['relevance'][d])
                result['query'].append(q_x['query'][d])
                result['doc'].append(q_x['doc'][d])
            rb -= take_n_sample
            doc_ind += take_n_sample
            if rb > 0 or doc_ind >= len(q_x['query']):
                query_ind += 1
                doc_ind = 0
            if rb == 0 or (len(result['qd_size']) > 0 and query_ind >= len(X)):
                # return batch
                yield_result = {
                    'qd_size': np.array(result['qd_size'], dtype=np.int32),
                }
                if q_y is not None:
                    yield_result['relevance'] = np.array(result['relevance'],
                                                         dtype=np.int32)
                yield_result['query'] = data_pad(
                    result['query'], np.max(yield_result['qd_size'][:, 0]),
                    np.int32)
                yield_result['doc'] = data_pad(
                    result['doc'], np.max(yield_result['qd_size'][:, 1]),
                    np.int32)
                yield_result['qid'] = np.array(result['qid'], dtype=str)
                yield_result['docid'] = np.array(result['docid'], dtype=str)
                #print('qid: {}'.format(list(zip(range(len(result['qid'])), result['qid']))))
                #print('docid: {}'.format(list(zip(range(len(result['docid'])), result['docid']))))
                total_batch_num += 1
                yield yield_result, time.time() - start_time
                start_time = time.time()
                if batch_num and total_batch_num >= batch_num:
                    # end the batcher without traversing all the samples
                    break
                rb = batch_size
                result = {
                    'qid': [],
                    'docid': [],
                    'qd_size': [],
                    'relevance': [],
                    'query': [],
                    'doc': [],
                }

    if filepath.endswith('pointwise'):
        for q, q_samples in samples_gb_q:
            q_x = {
                'query': [],
                'doc': [],
                'qd_size': [],
                'max_q_len': max_q_len,
                'max_d_len': max_d_len,
                'qid': [],
                'docid': [],
            }
            q_y = {
                'relevance': [],
            }
            for s in q_samples:
                # truncate queries and documents to max_q_len / max_d_len
                qm = query_raw[s[0]][:max_q_len]
                dm = doc_raw[s[1]][:max_d_len]
                q_x['query'].append(qm)
                q_x['doc'].append(dm)
                q_x['qd_size'].append([len(qm), len(dm)])
                q_x['qid'].append(s[0])
                q_x['docid'].append(s[1])
                q_y['relevance'].append(relevance_mapper(s[2]))
            X.append(q_x)
            y.append(q_y)
        return X, y, batcher
    elif filepath.endswith('pairwise'):
        for q, q_samples in samples_gb_q:
            q_x = {
                'query': [],
                'doc': [],
                'qd_size': [],
                'max_q_len': max_q_len,
                'max_d_len': max_d_len,
                'qid': [],
                'docid': [],
            }
            q_y = {
                'relevance': [],
            }
            for s in q_samples:
                # truncate queries and documents to max_q_len / max_d_len
                if s[3] == 0:
                    # only consider pairs with difference
                    continue
                qm = query_raw[s[0]][:max_q_len]
                dm1 = doc_raw[s[1]][:max_d_len]
                dm2 = doc_raw[s[2]][:max_d_len]
                q_x['query'].append(qm)
                q_x['query'].append(qm)
                q_x['qid'].append(s[0])
                q_x['qid'].append(s[0])
                if s[3] < 0:
                    # swap so the first doc is always the more relevant one,
                    # making every stored pair a positive pair
                    dm1, dm2 = dm2, dm1
                    q_x['docid'].append(s[2])
                    q_x['docid'].append(s[1])
                    q_y['relevance'].append(-s[3])
                    q_y['relevance'].append(-s[3])
                else:
                    q_x['docid'].append(s[1])
                    q_x['docid'].append(s[2])
                    q_y['relevance'].append(s[3])
                    q_y['relevance'].append(s[3])
                q_x['doc'].append(dm1)
                q_x['doc'].append(dm2)
                q_x['qd_size'].append([len(qm), len(dm1)])
                q_x['qd_size'].append([len(qm), len(dm2)])
            X.append(q_x)
            y.append(q_y)

        def pairwise_batcher(X,
                             y=None,
                             batch_size=128,
                             use_permutation=True,
                             batch_num=None):
            # pairwise samples are stored as consecutive (doc1, doc2) rows,
            # so the batch size must be even to keep every pair intact
            if batch_size % 2 != 0:
                raise Exception(
                    'batch_size must be even in the pairwise approach')
            return batcher(X,
                           y=y,
                           batch_size=batch_size,
                           use_permutation=use_permutation,
                           batch_num=batch_num)

        return X, y, pairwise_batcher
    else:
        raise NotImplementedError()
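A minimal driving loop for the assembled data, assuming query_raw and doc_raw map ids to lists of token ids; every name below other than data_assemble and its batcher is hypothetical:

X, y, batcher = data_assemble('data/train.prep.pointwise',
                              query_raw, doc_raw,
                              max_q_len=20, max_d_len=1000)
for batch, elapsed in batcher(X, y, batch_size=128):
    # batch['query'] and batch['doc'] are padded int32 arrays; the true
    # lengths live in batch['qd_size'][:, 0] and batch['qd_size'][:, 1]
    train_step(batch)  # hypothetical training function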
Example #7
File: prep.py Project: jzbjyb/rri
def gen_tfrecord():
    # number of tfrecord shard files to write
    num_shards = 1
    print('use {} shards'.format(num_shards))

    def _pick_output_shard():
        return random.randint(0, num_shards - 1)

    print('load text file ...')
    doc_file = os.path.join(args.data_dir, 'docs.prep')
    if args.format == 'ir':
        query_file = os.path.join(args.data_dir, 'query.prep')
    doc_raw = load_prep_file(doc_file, file_format=args.format)
    if args.format == 'ir':
        query_raw = load_prep_file(query_file, file_format=args.format)
    else:
        query_raw = doc_raw
    train_filename = os.path.join(args.data_dir,
                                  'train.prep.{}'.format(args.paradigm))
    test_filename = os.path.join(args.data_dir,
                                 'test.prep.{}'.format(args.paradigm))
    for fn in [train_filename, test_filename]:
        print('convert "{}" ...'.format(fn))
        output_file = fn + '.tfrecord'
        writers = []
        for i in range(num_shards):
            writers.append(
                tf.python_io.TFRecordWriter('%s-%03i-of-%03i' %
                                            (output_file, i, num_shards)))
        samples = load_train_test_file(fn,
                                       file_format=args.format,
                                       reverse=args.reverse)
        for i, sample in enumerate(samples):
            if i % 100000 == 0:
                # progress counter in units of 10k ('w' = 万, ten thousand)
                print('{}w'.format(i // 10000))
            features = {}
            if args.paradigm == 'pointwise':
                q, d, r = sample
                qb = q.encode('utf-8')
                db = d.encode('utf-8')
                features['docid'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[db]))
                features['doc'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=doc_raw[d]))
                features['doclen'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[len(doc_raw[d])]))
            elif args.paradigm == 'pairwise':
                q, d1, d2, r = sample
                qb = q.encode('utf-8')
                d1b = d1.encode('utf-8')
                d2b = d2.encode('utf-8')
                features['docid1'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[d1b]))
                features['docid2'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[d2b]))
                features['doc1'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=doc_raw[d1]))
                features['doc2'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=doc_raw[d2]))
                features['doc1len'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[len(doc_raw[d1])]))
                features['doc2len'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[len(doc_raw[d2])]))
            features['qid'] = tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[qb]))
            features['query'] = tf.train.Feature(int64_list=tf.train.Int64List(
                value=query_raw[q]))
            features['label'] = tf.train.Feature(float_list=tf.train.FloatList(
                value=[r]))
            features['qlen'] = tf.train.Feature(int64_list=tf.train.Int64List(
                value=[len(query_raw[q])]))
            f = tf.train.Features(feature=features)
            example = tf.train.Example(features=f)
            # randomly choose a shard to save the example
            writers[_pick_output_shard()].write(example.SerializeToString())
        # close the writers so the last records are flushed to disk
        for writer in writers:
            writer.close()
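A matching reader sketch for the pointwise records, staying within the TF 1.x API family that tf.python_io implies. The feature names and shard file name follow the writer above; everything else is an assumption:

import tensorflow as tf

def parse_pointwise(serialized):
    features = {
        'qid': tf.FixedLenFeature([], tf.string),
        'docid': tf.FixedLenFeature([], tf.string),
        'query': tf.VarLenFeature(tf.int64),  # variable-length token ids
        'doc': tf.VarLenFeature(tf.int64),
        'qlen': tf.FixedLenFeature([1], tf.int64),
        'doclen': tf.FixedLenFeature([1], tf.int64),
        'label': tf.FixedLenFeature([1], tf.float32),
    }
    return tf.parse_single_example(serialized, features)

dataset = tf.data.TFRecordDataset(
    ['train.prep.pointwise.tfrecord-000-of-001'])
dataset = dataset.map(parse_pointwise).batch(32)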