Example 1
def create_instance_pointwise(tokenizer, max_seq_length, qid, docno, query,
                              doc, label):
    query = tokenization.convert_to_unicode(query)
    doc = tokenization.convert_to_unicode(doc)
    passages = get_passages(doc, FLAGS.plen, FLAGS.overlap)
    if len(passages) == 0:
        tf.logging.warn("Passage length is 0 in qid {} docno {}".format(
            qid, docno))

    # Note: the query is capped at 64 tokens here; each passage below gets
    # the remaining max_seq_length - len(query) token budget.
    query = tokenization.convert_to_bert_input(text=query,
                                               max_seq_length=64,
                                               tokenizer=tokenizer,
                                               add_cls=True,
                                               convert_to_id=False)
    passages = [
        tokenization.convert_to_bert_input(
            text=p,
            max_seq_length=max_seq_length - len(query),
            tokenizer=tokenizer,
            add_cls=False,
            convert_to_id=False) for p in passages
    ]
    instance = PointwiseInstance(exampleid="{}-{}".format(qid, docno),
                                 tokens_a=query,
                                 tokens_b_list=passages,
                                 relation_label=label)

    return instance
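
get_passages is not defined in this example. Below is a minimal sketch, assuming plen and overlap are counted in whitespace-split words (the real helper may operate on tokens instead):

def get_passages(doc, plen, overlap):
    # Hypothetical sliding-window splitter: windows of `plen` words with
    # `overlap` words shared between consecutive windows.
    words = doc.split()
    stride = max(plen - overlap, 1)
    return [" ".join(words[start:start + plen])
            for start in range(0, len(words), stride)]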
Example 2
def convert_dataset(queries, passages, qrels, tokenizer, fold, split):
    """ Convert <query, passage> pairs to TFRecord. """
    main_path = os.path.join(FLAGS.output_path, "fold-" + str(fold))
    if not tf.gfile.Exists(main_path):
        tf.gfile.MakeDirs(main_path)
    out_query_passage = os.path.join(
        main_path, '{}_query_maxp_{}.tf'.format(FLAGS.dataset, split))
    with tf.python_io.TFRecordWriter(out_query_passage) as writer, \
            tf.gfile.Open(os.path.join(FLAGS.passage_path, "fold-" + str(fold),
                                       '{}_query_passage_{}_top1.tsv'.format(FLAGS.dataset, split)), 'r') as qp_file:
        for i, line in enumerate(qp_file):
            qid, Q0, doc_id, pid, rank, score, run_name = line.split("\t")
            query = queries[qid]

            query = tokenization.convert_to_unicode(query)
            query_tokens = tokenization.convert_to_bert_input(
                text=query,
                max_seq_length=FLAGS.max_query_length,
                tokenizer=tokenizer,
                add_cls=True,
                add_sep=True)

            query_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=query_tokens))

            passage_content = passages[pid]

            passage_tokens = tokenization.convert_to_bert_input(
                text=tokenization.convert_to_unicode(passage_content),
                max_seq_length=FLAGS.max_passage_length,
                tokenizer=tokenizer,
                add_cls=False,
                add_sep=True)

            passage_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=passage_tokens))

            label = 1 if doc_id in qrels[qid] else 0

            labels_tf = tf.train.Feature(int64_list=tf.train.Int64List(
                value=[label]))

            features = tf.train.Features(
                feature={
                    'query_token_ids': query_token_ids_tf,
                    'piece_token_ids': passage_token_ids_tf,
                    'label': labels_tf,
                })

            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())

            if (i + 1) % 1000 == 0:
                print("process {} examples".format(i + 1))
Example 3
def convert_query_example(query_id, query, max_seq_length, tokenizer):

    query = tokenization.convert_to_unicode(query)

    query_tokens = tokenization.convert_to_bert_input(
        text=query,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        add_cls=True,
        add_sep=True)

    return query_tokens
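
A hedged usage sketch for this wrapper, assuming the tokenization module in scope is BERT-style and exposes FullTokenizer (the vocab path and the query text below are only illustrative):

tokenizer = tokenization.FullTokenizer(vocab_file='vocab.txt',  # hypothetical path
                                       do_lower_case=True)
query_token_ids = convert_query_example(query_id='301',
                                        query='international organized crime',
                                        max_seq_length=64,
                                        tokenizer=tokenizer)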
Example 4
def convert_single_piece_example(ex_index, query_token_ids, piece,
                                 max_seq_length, tokenizer):

    piece_tokens = tokenization.convert_to_bert_input(
        text=tokenization.convert_to_unicode(piece),
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        add_cls=False,
        add_sep=True)

    features = {
        'query_token_ids': query_token_ids,
        'label': 0,
        'piece_token_ids': piece_tokens
    }

    return features
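
Unlike the other converters here, this one returns plain Python lists and an int rather than tf.train.Feature objects. A hedged helper (hypothetical name) that serializes the returned dict the same way the other examples do:

import tensorflow as tf

def piece_features_to_tf_example(features):
    # Mirrors the Int64List serialization used by the surrounding examples
    # for the same three feature keys.
    return tf.train.Example(features=tf.train.Features(feature={
        'query_token_ids': tf.train.Feature(
            int64_list=tf.train.Int64List(value=features['query_token_ids'])),
        'piece_token_ids': tf.train.Feature(
            int64_list=tf.train.Int64List(value=features['piece_token_ids'])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[features['label']])),
    }))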
Example 5
def convert_dataset(main_path, data, collection, tokenizer, split=""):
    """ Split a document into passages/chunks and convert <query, passage/chunk> pairs to TFRecord."""
    suffix = ""
    if split != "":
        suffix = "_" + split

    if not tf.gfile.Exists(main_path):
        tf.gfile.MakeDirs(main_path)
    id_file = tf.gfile.Open(
        os.path.join(main_path,
                     'query_{}_ids{}.txt'.format(FLAGS.task, suffix)), 'w')
    text_file = tf.gfile.Open(
        os.path.join(main_path,
                     '{}_id_text{}.txt'.format(FLAGS.task, suffix)), 'w')
    out_tf_path = os.path.join(main_path,
                               'query_{}{}.tf'.format(FLAGS.task, suffix))
    id_set = set()
    with tf.python_io.TFRecordWriter(out_tf_path) as writer:
        for i, query_id in enumerate(data):
            query, qrels, doc_ids = data[query_id]

            query = tokenization.convert_to_unicode(query)

            query_tokens = tokenization.convert_to_bert_input(
                text=query,
                max_seq_length=FLAGS.max_query_length,
                tokenizer=tokenizer,
                add_cls=True,
                add_sep=True)

            query_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=query_tokens))

            # doc_depth is the number of top-ranked documents (top_docs_num) kept from the chunk file
            doc_ids = doc_ids[:FLAGS.doc_depth]

            if (i + 1) % 1000 == 0:
                print("processed {} queries".format(i + 1))

            for doc_id in doc_ids:

                title = None
                if FLAGS.dataset == 'robust04' and FLAGS.task == 'passage':
                    title, body = collection[doc_id].split("\t")
                    title = " ".join(
                        title.split(" ")
                        [:FLAGS.max_title_length]).strip()  # truncate title
                    if title == '' or title == '.':  # if title is invalid
                        title = None
                else:
                    body = collection[doc_id]

                pieces = get_pieces(body, FLAGS.window_size, FLAGS.stride)

                for j, piece in enumerate(pieces):
                    piece_id = doc_id + "_{}-{}".format(FLAGS.task, j)

                    id_file.write('{}\t{}\n'.format(query_id, piece_id))

                    if title:
                        piece = title + ' ' + piece

                    if FLAGS.task == "passage":
                        max_piece_length = FLAGS.max_passage_length
                    else:
                        max_piece_length = FLAGS.max_query_length

                    piece_tokens = tokenization.convert_to_bert_input(
                        text=tokenization.convert_to_unicode(piece),
                        max_seq_length=max_piece_length,
                        tokenizer=tokenizer,
                        add_cls=False,
                        add_sep=True)

                    if piece_id not in id_set:
                        id_set.add(piece_id)
                        text_file.write(piece_id + "\t" + piece + "\n")

                    piece_token_ids_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=piece_tokens))

                    labels_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[0]))  # fake label

                    features = tf.train.Features(
                        feature={
                            'query_token_ids': query_token_ids_tf,
                            'label': labels_tf,
                            'piece_token_ids': piece_token_ids_tf
                        })
                    example = tf.train.Example(features=features)
                    writer.write(example.SerializeToString())

            if i % 1000 == 0:
                print('wrote {} of {} queries'.format(i, len(data)))

    id_file.close()
    text_file.close()
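
get_pieces is not shown either; it appears to play the same role as get_passages in Example 1, but parameterized by a window size and a stride rather than an overlap. A minimal sketch under that assumption:

def get_pieces(body, window_size, stride):
    # Hypothetical sketch: windows of `window_size` words taken every
    # `stride` words; the real splitter may work on tokens instead.
    words = body.split()
    return [" ".join(words[start:start + window_size])
            for start in range(0, len(words), stride)]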
Example 6
def convert_dataset(data, passages, chunks, qc_scores, tokenizer, fold, split):
    """ Convert <chunk, passage> pairs to TFRecord."""
    output_path = os.path.join(FLAGS.output_path, "fold-" + str(fold),
                               "rerank-{0}_kc-{1}".format(FLAGS.rerank_num, FLAGS.kc), "data")
    if not tf.gfile.Exists(output_path):
        tf.gfile.MakeDirs(output_path)

    out_chunk_passage = os.path.join(output_path, 'chunk_passage_{0}.tf'.format(split))
    with tf.python_io.TFRecordWriter(out_chunk_passage) as writer, \
            tf.gfile.Open(os.path.join(output_path, 'chunk_passage_ids_{0}.txt'.format(split)),
                          'w') as chunk_passage_ids_file:
        qids = list(data.keys())
        if split == "train":
            random.shuffle(qids)
        for i, query_id in enumerate(qids):
            query, chunk_id_list, passage_ids, labels = data[query_id]
            pid_labels = list(zip(passage_ids, labels))
            pid_labels = pid_labels[:FLAGS.rerank_num]

            for pid, label in pid_labels:
                p_content = passages[pid]

                passage_tokens = tokenization.convert_to_bert_input(
                    text=tokenization.convert_to_unicode(p_content),
                    max_seq_length=FLAGS.max_passage_length,
                    tokenizer=tokenizer,
                    add_cls=False,
                    add_sep=True)

                passage_token_ids_tf = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=passage_tokens))

                labels_tf = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label]))

                for chunk_id in chunk_id_list:
                    chunk_content = chunks[chunk_id]
                    qc_score = qc_scores[query_id][chunk_id]
                    query_tokens = tokenization.convert_to_bert_input(
                        text=tokenization.convert_to_unicode(chunk_content),
                        max_seq_length=FLAGS.max_query_length,
                        tokenizer=tokenizer,
                        add_cls=True,
                        add_sep=True)

                    query_token_ids_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=query_tokens))

                    qc_score_tf = tf.train.Feature(
                        float_list=tf.train.FloatList(value=[qc_score])
                    )

                    chunk_passage_ids_file.write(
                        query_id + "\t" + chunk_id + "\t" + pid + "\t" + str(label) + "\t" + str(qc_score) + "\n")

                    features = tf.train.Features(feature={
                        'query_token_ids': query_token_ids_tf,
                        'piece_token_ids': passage_token_ids_tf,
                        'label': labels_tf,
                        'qc_score': qc_score_tf
                    })
                    example = tf.train.Example(features=features)
                    writer.write(example.SerializeToString())
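
These records additionally carry the float-valued qc_score feature, so a hedged TF 1.x parsing sketch for them adds one entry to the feature spec:

import tensorflow as tf

def parse_chunk_passage_record(serialized_example):
    # Matches the four features written above; qc_score is a single float.
    feature_spec = {
        'query_token_ids': tf.VarLenFeature(tf.int64),
        'piece_token_ids': tf.VarLenFeature(tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
        'qc_score': tf.FixedLenFeature([], tf.float32),
    }
    return tf.parse_single_example(serialized_example, feature_spec)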