def create_instance_pointwise(tokenizer, max_seq_length, qid, docno, query, doc, label):
    query = tokenization.convert_to_unicode(query)
    doc = tokenization.convert_to_unicode(doc)
    passages = get_passages(doc, FLAGS.plen, FLAGS.overlap)
    if len(passages) == 0:
        tf.logging.warn("Passage length is 0 in qid {} docno {}".format(qid, docno))
    query = tokenization.convert_to_bert_input(text=query,
                                               max_seq_length=64,
                                               tokenizer=tokenizer,
                                               add_cls=True,
                                               convert_to_id=False)
    passages = [
        tokenization.convert_to_bert_input(
            text=p,
            max_seq_length=max_seq_length - len(query),
            tokenizer=tokenizer,
            add_cls=False,
            convert_to_id=False) for p in passages
    ]
    instance = PointwiseInstance(exampleid="{}-{}".format(qid, docno),
                                 tokens_a=query,
                                 tokens_b_list=passages,
                                 relation_label=label)
    return instance
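
# Hedged sketch (not part of the original file): `get_passages` is called above but
# defined elsewhere in the repo. A minimal sliding-window splitter consistent with the
# call get_passages(doc, FLAGS.plen, FLAGS.overlap) might look like the function below;
# the real helper's tokenization and boundary handling may differ. Assumes plen > overlap.
def get_passages_sketch(text, plen, overlap):
    """Split whitespace-tokenized `text` into windows of `plen` words, adjacent windows sharing `overlap` words."""
    words = text.split()
    passages = []
    start = 0
    step = max(1, plen - overlap)  # guard against a non-positive step
    while start < len(words):
        passages.append(" ".join(words[start:start + plen]))
        if start + plen >= len(words):
            break
        start += step
    return passages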
def convert_dataset(queries, passages, qrels, tokenizer, fold, split):
    """Convert <query, passage> pairs to TFRecord."""
    main_path = os.path.join(FLAGS.output_path, "fold-" + str(fold))
    if not tf.gfile.Exists(main_path):
        tf.gfile.MakeDirs(main_path)
    out_query_passage = os.path.join(
        main_path, '{}_query_maxp_{}.tf'.format(FLAGS.dataset, split))
    with tf.python_io.TFRecordWriter(out_query_passage) as writer, \
            tf.gfile.Open(
                os.path.join(FLAGS.passage_path, "fold-" + str(fold),
                             '{}_query_passage_{}_top1.tsv'.format(FLAGS.dataset, split)),
                'r') as qp_file:
        for i, line in enumerate(qp_file):
            qid, Q0, doc_id, pid, rank, score, run_name = line.split("\t")
            query = queries[qid]
            query = tokenization.convert_to_unicode(query)
            query_tokens = tokenization.convert_to_bert_input(
                text=query,
                max_seq_length=FLAGS.max_query_length,
                tokenizer=tokenizer,
                add_cls=True,
                add_sep=True)
            query_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=query_tokens))
            passage_content = passages[pid]
            passage_tokens = tokenization.convert_to_bert_input(
                text=tokenization.convert_to_unicode(passage_content),
                max_seq_length=FLAGS.max_passage_length,
                tokenizer=tokenizer,
                add_cls=False,
                add_sep=True)
            passage_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=passage_tokens))
            label = 1 if doc_id in qrels[qid] else 0
            labels_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[label]))
            features = tf.train.Features(
                feature={
                    'query_token_ids': query_token_ids_tf,
                    'piece_token_ids': passage_token_ids_tf,
                    'label': labels_tf,
                })
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
            if (i + 1) % 1000 == 0:
                print("process {} examples".format(i + 1))
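
# Hedged sketch (not in the original file): one way to parse the records written by
# convert_dataset above with the TF 1.x tf.data API. The feature names match the writer;
# the function name, dense conversion, and any batching/padding are illustrative only.
# tf.sparse.to_dense assumes TF >= 1.13.
def load_query_passage_tfrecord_sketch(tfrecord_path):
    feature_spec = {
        'query_token_ids': tf.VarLenFeature(tf.int64),
        'piece_token_ids': tf.VarLenFeature(tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    def _parse(serialized_example):
        parsed = tf.parse_single_example(serialized_example, feature_spec)
        # Token id lists were written with variable length, so they come back sparse.
        query_ids = tf.sparse.to_dense(parsed['query_token_ids'])
        piece_ids = tf.sparse.to_dense(parsed['piece_token_ids'])
        return query_ids, piece_ids, parsed['label']

    return tf.data.TFRecordDataset(tfrecord_path).map(_parse)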
def convert_query_example(query_id, query, max_seq_length, tokenizer):
    query = tokenization.convert_to_unicode(query)
    query_tokens = tokenization.convert_to_bert_input(
        text=query,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        add_cls=True,
        add_sep=True)
    return query_tokens
def convert_single_piece_example(ex_index, query_token_ids, piece, max_seq_length, tokenizer):
    piece_tokens = tokenization.convert_to_bert_input(
        text=tokenization.convert_to_unicode(piece),
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        add_cls=False,
        add_sep=True)
    features = {
        'query_token_ids': query_token_ids,
        'label': 0,
        'piece_token_ids': piece_tokens
    }
    return features
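
# Hedged usage sketch (not in the original file): how the two helpers above might be
# combined into a serialized tf.train.Example with the same feature names the writers
# in this file use. `tokenizer` is assumed to be the repo's BERT tokenizer; the
# function name and the use of FLAGS.max_query_length / FLAGS.max_passage_length here
# are illustrative choices, not repo conventions.
def piece_features_to_example_sketch(query_id, query, piece, tokenizer):
    query_token_ids = convert_query_example(query_id, query,
                                            FLAGS.max_query_length, tokenizer)
    feats = convert_single_piece_example(0, query_token_ids, piece,
                                         FLAGS.max_passage_length, tokenizer)
    features = tf.train.Features(feature={
        'query_token_ids': tf.train.Feature(
            int64_list=tf.train.Int64List(value=feats['query_token_ids'])),
        'piece_token_ids': tf.train.Feature(
            int64_list=tf.train.Int64List(value=feats['piece_token_ids'])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[feats['label']])),
    })
    return tf.train.Example(features=features).SerializeToString()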
def convert_dataset(main_path, data, collection, tokenizer, split=""):
    """Split a document into passages/chunks and convert <query, passage/chunk> pairs to TFRecord."""
    suffix = ""
    if split != "":
        suffix = "_" + split
    if not tf.gfile.Exists(main_path):
        tf.gfile.MakeDirs(main_path)
    id_file = tf.gfile.Open(
        os.path.join(main_path, 'query_{}_ids{}.txt'.format(FLAGS.task, suffix)), 'w')
    text_file = tf.gfile.Open(
        os.path.join(main_path, '{}_id_text{}.txt'.format(FLAGS.task, suffix)), 'w')
    out_tf_path = os.path.join(main_path, 'query_{}{}.tf'.format(FLAGS.task, suffix))
    id_set = set()
    with tf.python_io.TFRecordWriter(out_tf_path) as writer:
        for i, query_id in enumerate(data):
            query, qrels, doc_ids = data[query_id]
            query = tokenization.convert_to_unicode(query)
            query_tokens = tokenization.convert_to_bert_input(
                text=query,
                max_seq_length=FLAGS.max_query_length,
                tokenizer=tokenizer,
                add_cls=True,
                add_sep=True)
            query_token_ids_tf = tf.train.Feature(
                int64_list=tf.train.Int64List(value=query_tokens))
            # here doc_depth is the top_docs_num in chunk file
            doc_ids = doc_ids[:FLAGS.doc_depth]
            if (i + 1) % 1000 == 0:
                print("process {} queries".format(i))
            for doc_id in doc_ids:
                title = None
                if FLAGS.dataset == 'robust04' and FLAGS.task == 'passage':
                    title, body = collection[doc_id].split("\t")
                    # truncate title
                    title = " ".join(title.split(" ")[:FLAGS.max_title_length]).strip()
                    if title == '' or title == '.':  # if title is invalid
                        title = None
                else:
                    body = collection[doc_id]
                pieces = get_pieces(body, FLAGS.window_size, FLAGS.stride)
                for j, piece in enumerate(pieces):
                    piece_id = doc_id + "_{}-{}".format(FLAGS.task, j)
                    id_file.write('{}\t{}\n'.format(query_id, piece_id))
                    if title:
                        piece = title + ' ' + piece
                    if FLAGS.task == "passage":
                        max_piece_length = FLAGS.max_passage_length
                    else:
                        max_piece_length = FLAGS.max_query_length
                    piece_tokens = tokenization.convert_to_bert_input(
                        text=tokenization.convert_to_unicode(piece),
                        max_seq_length=max_piece_length,
                        tokenizer=tokenizer,
                        add_cls=False,
                        add_sep=True)
                    if piece_id not in id_set:
                        id_set.add(piece_id)
                        text_file.write(piece_id + "\t" + piece + "\n")
                    piece_token_ids_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=piece_tokens))
                    labels_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[0]))  # fake label
                    features = tf.train.Features(
                        feature={
                            'query_token_ids': query_token_ids_tf,
                            'label': labels_tf,
                            'piece_token_ids': piece_token_ids_tf
                        })
                    example = tf.train.Example(features=features)
                    writer.write(example.SerializeToString())
            if i % 1000 == 0:
                print('wrote {} of {} queries'.format(i, len(data)))
    id_file.close()
    text_file.close()
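
# Hedged sketch (not in the original file): `get_pieces` is referenced above but lives
# elsewhere in the repo. Judging from its arguments, it likely slides a window of
# `window_size` words with a step of `stride` words; the real implementation may handle
# boundaries and tokenization differently. Assumes stride > 0.
def get_pieces_sketch(text, window_size, stride):
    words = text.split()
    pieces = []
    start = 0
    while start < len(words):
        pieces.append(" ".join(words[start:start + window_size]))
        start += stride
    return pieces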
def convert_dataset(data, passages, chunks, qc_scores, tokenizer, fold, split):
    """Convert <chunk, passage> pairs to TFRecord."""
    output_path = os.path.join(FLAGS.output_path, "fold-" + str(fold),
                               "rerank-{0}_kc-{1}".format(FLAGS.rerank_num, FLAGS.kc),
                               "data")
    if not tf.gfile.Exists(output_path):
        tf.gfile.MakeDirs(output_path)
    out_chunk_passage = os.path.join(output_path,
                                     'chunk_passage_{0}.tf'.format(split))
    with tf.python_io.TFRecordWriter(out_chunk_passage) as writer, \
            tf.gfile.Open(
                os.path.join(output_path, 'chunk_passage_ids_{0}.txt'.format(split)),
                'w') as chunk_passage_ids_file:
        qids = list(data.keys())
        if split == "train":
            random.shuffle(qids)
        for i, query_id in enumerate(qids):
            query, chunk_id_list, passage_ids, labels = data[query_id]
            pid_labels = list(zip(passage_ids, labels))
            pid_labels = pid_labels[:FLAGS.rerank_num]
            for pid, label in pid_labels:
                p_content = passages[pid]
                passage_tokens = tokenization.convert_to_bert_input(
                    text=tokenization.convert_to_unicode(p_content),
                    max_seq_length=FLAGS.max_passage_length,
                    tokenizer=tokenizer,
                    add_cls=False,
                    add_sep=True)
                passage_token_ids_tf = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=passage_tokens))
                labels_tf = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label]))
                for chunk_id in chunk_id_list:
                    chunk_content = chunks[chunk_id]
                    qc_score = qc_scores[query_id][chunk_id]
                    query_tokens = tokenization.convert_to_bert_input(
                        text=tokenization.convert_to_unicode(chunk_content),
                        max_seq_length=FLAGS.max_query_length,
                        tokenizer=tokenizer,
                        add_cls=True,
                        add_sep=True)
                    query_token_ids_tf = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=query_tokens))
                    qc_score_tf = tf.train.Feature(
                        float_list=tf.train.FloatList(value=[qc_score]))
                    chunk_passage_ids_file.write(
                        query_id + "\t" + chunk_id + "\t" + pid + "\t" +
                        str(label) + "\t" + str(qc_score) + "\n")
                    features = tf.train.Features(
                        feature={
                            'query_token_ids': query_token_ids_tf,
                            'piece_token_ids': passage_token_ids_tf,
                            'label': labels_tf,
                            'qc_score': qc_score_tf
                        })
                    example = tf.train.Example(features=features)
                    writer.write(example.SerializeToString())
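
# Hedged sketch (not in the original file): the side file written above stores one line
# per (query, chunk, passage) triple as "qid\tchunk_id\tpid\tlabel\tqc_score". A reader
# that groups those rows by (qid, pid) might look like the following; the function name
# and grouping choice are illustrative only.
import collections

def read_chunk_passage_ids_sketch(path):
    grouped = collections.defaultdict(list)
    with open(path) as f:
        for line in f:
            qid, chunk_id, pid, label, qc_score = line.rstrip("\n").split("\t")
            grouped[(qid, pid)].append((chunk_id, int(label), float(qc_score)))
    return grouped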