def load_qrels(qrels_path):
    if qrels_path is None:
        return None

    print_message("#> Loading qrels from", qrels_path, "...")

    qrels = {}
    with open(qrels_path, mode="r", encoding="utf-8") as f:
        for line in f:
            qid, x, pid, y = map(int, line.strip().split("\t"))
            assert x == 0 and y == 1
            qrels[qid] = qrels.get(qid, [])
            qrels[qid].append(pid)

    assert all(len(qrels[qid]) == len(set(qrels[qid])) for qid in qrels)

    avg_positive = round(sum(len(qrels[qid]) for qid in qrels) / len(qrels), 2)

    print_message(
        "#> Loaded qrels for", len(qrels),
        "unique queries with", avg_positive,
        "positives per query on average.\n",
    )

    return qrels
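# Hedged usage sketch (not part of the original code): load_qrels expects TREC-style
# qrels TSV lines of the form "qid<TAB>0<TAB>pid<TAB>1", one relevant passage per line.
# The path below is hypothetical and only illustrates the call.
#
#   qrels = load_qrels("data/qrels.dev.small.tsv")
#   print(qrels[some_qid])   # -> list of positive pids for that query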
def load_colbert(args):
    print_message("#> Loading model checkpoint.")

    if args.dense:
        colbert = ColBERT.from_pretrained(
            "bert-base-uncased",
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            dim=args.dim,
            similarity_metric=args.similarity,
        )
    else:
        colbert = SparseColBERT.from_pretrained(
            "bert-base-uncased",
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            k=args.k,
            n=args.n,
            use_nonneg=args.use_nonneg,
            normalize_sparse=args.normalize_sparse,
            similarity_metric=args.similarity,
        )
    colbert = colbert.to(DEVICE)

    checkpoint = load_checkpoint(args.checkpoint, colbert)

    colbert.eval()

    print("\n")

    return colbert, checkpoint
def __init__(self, data_file):
    print_message("#> Training with the triples in", data_file, "...\n\n")

    if data_file.endswith(".parquet"):
        self.data = pd.read_parquet(data_file).values.tolist()
    else:
        self.data = pd.read_csv(
            data_file, sep="\t", names=["query", "pos", "neg"]
        ).values.tolist()
def load_colbert(args):
    print_message("#> Loading model checkpoint.")

    colbert = MultiBERT.from_pretrained('bert-base-uncased')
    colbert = colbert.to(DEVICE)

    checkpoint = load_checkpoint(args.checkpoint, colbert)

    colbert.eval()

    print('\n')

    return colbert, checkpoint
def train(args):
    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity)
    colbert = colbert.to(DEVICE)
    colbert.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(colbert.parameters(), lr=args.lr, eps=1e-8)
    optimizer.zero_grad()

    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    reader = TrainReader(args.triples)
    train_loss = 0.0

    for batch_idx in range(args.maxsteps):
        Batch = reader.get_minibatch(args.bsize)
        Batch = sorted(Batch, key=lambda x: max(len(x[1]), len(x[2])))

        for B_idx in range(args.accumsteps):
            size = args.bsize // args.accumsteps
            B = Batch[B_idx * size:(B_idx + 1) * size]
            Q, D1, D2 = zip(*B)

            colbert_out = colbert(Q + Q, D1 + D2)
            colbert_out1, colbert_out2 = colbert_out[:len(Q)], colbert_out[len(Q):]

            out = torch.stack((colbert_out1, colbert_out2), dim=-1)

            positive_score, negative_score = round(colbert_out1.mean().item(), 2), round(colbert_out2.mean().item(), 2)
            print("#>>> ", positive_score, negative_score, '\t\t|\t\t', positive_score - negative_score)

            loss = criterion(out, labels[:out.size(0)])
            loss = loss / args.accumsteps
            loss.backward()

            train_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(colbert.parameters(), 2.0)

        optimizer.step()
        optimizer.zero_grad()

        print_message(batch_idx, train_loss / (batch_idx + 1))
        manage_checkpoints(colbert, optimizer, batch_idx + 1)
def load_colbert(args):
    print_message("#> Loading model checkpoint.")

    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity)
    colbert = colbert.to(DEVICE)

    checkpoint = load_checkpoint(args.checkpoint, colbert)

    colbert.eval()

    print('\n')

    return colbert, checkpoint
def encode(args, number_of_subindexes_already_saved=0):
    # TODO: Create a metadata file; save `args.input_arguments` in there
    create_directory(args.index)

    args.bsize = args.bsize * torch.cuda.device_count()

    print("#> Starting with NUM_GPUs =", torch.cuda.device_count())
    print("#> Accordingly, setting total args.bsize =", args.bsize)

    colbert = args.colbert
    colbert.bert = nn.DataParallel(colbert.bert)
    colbert.linear = nn.DataParallel(colbert.linear)
    colbert = colbert.cuda()
    colbert.eval()

    print('\n\n\n')
    print("#> args.output_dir =", args.output_dir)
    print("#> number_of_subindexes_already_saved =", number_of_subindexes_already_saved)
    print('\n\n\n')

    super_batch_idx = 0
    super_batch, batch_indices = [], []

    with open(args.collection) as f:
        for idx, passage in enumerate(f):
            if len(super_batch) == SUPER_BATCH_SIZE:
                if super_batch_idx < number_of_subindexes_already_saved:
                    print("#> Skipping super_batch_idx =", super_batch_idx, ".......")
                else:
                    process_batch(args, super_batch_idx, batch_indices, super_batch)
                    print_message("Processed", str(idx), "passages so far...\n")

                super_batch_idx += 1
                super_batch, batch_indices = [], []

            pid, passage = passage.split('\t')
            super_batch.append(passage)
            batch_indices.append(idx)

            assert int(pid) == idx

    if len(super_batch):
        process_batch(args, super_batch_idx, batch_indices, super_batch)
        super_batch_idx += 1
def load_topK(topK_path):
    queries = {}
    topK_docs = {}
    topK_pids = {}

    print_message("#> Loading the top-k per query from", topK_path, "...")

    with open(topK_path, encoding="utf-8") as f:
        for line in f:
            qid, pid, query, passage = line.split("\t")
            qid, pid = int(qid), int(pid)

            assert (qid not in queries) or (queries[qid] == query)

            queries[qid] = query
            topK_docs[qid] = topK_docs.get(qid, [])
            topK_docs[qid].append(passage)
            topK_pids[qid] = topK_pids.get(qid, [])
            topK_pids[qid].append(pid)

    assert all(len(topK_pids[qid]) == len(set(topK_pids[qid])) for qid in topK_pids)

    Ks = [len(topK_pids[qid]) for qid in topK_pids]

    print_message("#> max(Ks) =", max(Ks), ", avg(Ks) =", round(sum(Ks) / len(Ks), 2))
    print_message("#> Loaded the top-k per query for", len(queries), "unique queries.\n")

    return queries, topK_docs, topK_pids
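# Hedged usage sketch (not part of the original code): load_topK expects one candidate
# per line, "qid<TAB>pid<TAB>query text<TAB>passage text" (MS MARCO top-1000 style).
# The path below is hypothetical and only illustrates the call.
#
#   queries, topK_docs, topK_pids = load_topK("data/top1000.dev.tsv")
#   print(len(topK_pids[some_qid]))   # number of candidate passages for that query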
def process_batch(args, super_batch_idx, batch_indices, super_batch):
    colbert = args.colbert

    start_time = time()
    print_message("Start process_batch()", "")

    collection = []
    collection_indices = []

    with torch.no_grad():
        super_batch = list(pool.map(Tokenizer.tokenize, super_batch))
        print_message("Done tokenizing", "")

        sorted_idxs = sorted(range(len(super_batch)), key=lambda i: len(super_batch[i]))
        print_message("Done sorting", "")

        bucketed_outputs = []
        for batch_idx in range(ceil(len(super_batch) / args.bsize)):
            D_idxs = sorted_idxs[batch_idx * args.bsize: (batch_idx + 1) * args.bsize]
            D = [super_batch[d] for d in D_idxs]
            bucketed_outputs.append(
                # to_indexed_list(*colbert.doc(D, return_mask=True), nbytes=args.bytes)
                to_indexed_list(colbert.doc(D), nbytes=args.bytes)
            )
            collection_indices += [batch_indices[d] for d in D_idxs]

        for output in bucketed_outputs:
            collection += [d for d in output]

    throughput = round(len(super_batch) / (time() - start_time) * 60, 2)
    print("This super-batch's encoding rate:", throughput, "passages per minute.")

    output_path = os.path.join(args.index, str(super_batch_idx) + ".pt")
    offset, endpos = min(collection_indices), max(collection_indices)

    print("#> Writing", offset, "to", endpos, "to", output_path, "...")

    assert len(collection) == len(collection_indices)
    assert endpos - offset + 1 == len(collection_indices), len(collection_indices)
    assert len(collection_indices) == len(set(collection_indices))

    collectionX = [None] * len(collection_indices)
    for pos, idx in enumerate(collection_indices):
        collectionX[idx - offset] = collection[pos]

    # modified for sparse
    # torch.save(collectionX, output_path)
    collectionX = sparse.csr_matrix(collectionX)
    sparse.save_npz(output_path, collectionX)

    print("#> Saved!\n\n")
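# Hedged sketch (assumption, not from the original code): reading a saved sparse
# sub-index back. scipy's save_npz delegates to numpy's savez, which appends ".npz"
# when the target name lacks it, so "<idx>.pt" is typically stored as "<idx>.pt.npz".
#
#   from scipy import sparse
#   sub_index = sparse.load_npz(os.path.join(args.index, "0.pt.npz"))
#   print(sub_index.shape, sub_index.nnz)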
def __init__(self, data_file):
    print_message("#> Training with the triples in", data_file, "...\n\n")
    self.reader = open(data_file, mode='r', encoding="utf-8")
def evaluate(args, index=None):
    qrels, queries, topK_docs, topK_pids = args.qrels, args.queries, args.topK_docs, args.topK_pids

    metrics = Metrics(mrr_depths={10}, recall_depths={50, 200, 1000}, total_queries=None)

    if index:
        args.buffer = torch.zeros(1000, args.doc_maxlen, args.dim, dtype=index[0].dtype)

    output_path = '.'.join([str(x) for x in [args.run_name, 'tsv', int(time.time())]])
    output_path = os.path.join(args.output_dir, output_path)

    # TODO: Save an associated metadata file with the args.input_args

    with open(output_path, 'w') as outputfile:
        with torch.no_grad():
            keys = sorted(list(queries.keys()))
            random.shuffle(keys)

            for query_idx, qid in enumerate(keys):
                query = queries[qid]

                print_message(query_idx, qid, query, '\n')

                if qrels and args.shortcircuit and len(set.intersection(set(qrels[qid]), set(topK_pids[qid]))) == 0:
                    continue

                ranking = rerank(args, query, topK_pids[qid], topK_docs[qid], index)

                for i, (score, pid, passage) in enumerate(ranking):
                    outputfile.write('\t'.join([str(x) for x in [qid, pid, i + 1]]) + "\n")

                    if i + 1 in [1, 2, 5, 10, 20, 100]:
                        print("#> " + str(i + 1) + ") ", pid, ":", score, ' ', passage)

                if qrels:
                    metrics.add(query_idx, qid, ranking, qrels[qid])

                    for i, (score, pid, passage) in enumerate(ranking):
                        if pid in qrels[qid]:
                            print("\n#> Found", pid, "at position", i + 1, "with score", score)
                            print(passage)

                    metrics.print_metrics(query_idx)

    print_message("#> checkpoint['batch'] =", args.checkpoint['batch'], '\n')
    print("output_path =", output_path)
    print("\n\n")
def train(args):
    if args.use_dense:
        colbert = ColBERT.from_pretrained(
            args.base_model,
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            dim=args.dim,
            similarity_metric=args.similarity,
        )
    else:
        colbert = SparseColBERT.from_pretrained(
            args.base_model,
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            n=args.n,
            k=args.k,
            normalize_sparse=args.normalize_sparse,
            use_nonneg=args.use_nonneg,
            use_ortho=args.use_ortho,
            similarity_metric=args.similarity,
        )
    colbert = colbert.to(DEVICE)
    colbert.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(colbert.parameters(), lr=args.lr, eps=1e-8)
    optimizer.zero_grad()

    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    reader = TrainReader(args.triples)
    # dset = TrainDataset(args.triples)
    # loader = DataLoader(dset, batch_size=args.bsize, num_workers=0, pin_memory=True)

    train_loss = 0.0
    PRINT_PERIOD = 100

    for batch_idx in tqdm(range(args.maxsteps)):
        Batch = reader.get_minibatch(args.bsize)
        # for batch_idx, Batch in enumerate(tqdm(loader)):
        #     if batch_idx > args.maxsteps:
        #         print_message("#> Finish training at", batch_idx, "...\n\n")
        #         break
        #     Batch = [[q, pos, neg] for (q, pos, neg) in zip(Batch[0], Batch[1], Batch[2])]
        Batch = sorted(Batch, key=lambda x: max(len(x[1]), len(x[2])))

        positive_score, negative_score = 0.0, 0.0
        for B_idx in range(args.accumsteps):
            size = args.bsize // args.accumsteps
            B = Batch[B_idx * size : (B_idx + 1) * size]
            Q, D1, D2 = zip(*B)

            colbert_out, QQ_emb, DD_emb = colbert(Q + Q, D1 + D2, return_embedding=True)
            colbert_out1, colbert_out2 = colbert_out[: len(Q)], colbert_out[len(Q) :]

            out = torch.stack((colbert_out1, colbert_out2), dim=-1)

            positive_score, negative_score = (
                round(colbert_out1.mean().item(), 2),
                round(colbert_out2.mean().item(), 2),
            )
            # if (B_idx % PRINT_PERIOD) == 0:
            #     print("#>>> ", positive_score, negative_score, "\t\t|\t\t", positive_score - negative_score)

            loss = criterion(out, labels[: out.size(0)])
            if args.use_ortho:
                Q_emb, D1_emb, D2_emb = (
                    QQ_emb[: len(Q)],
                    DD_emb[: len(Q)],
                    DD_emb[len(Q) :],
                )
                loss_ortho = colbert.ortho_all([Q_emb, D1_emb, D2_emb])
                loss += loss_ortho

            loss = loss / args.accumsteps
            loss.backward()

            train_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(colbert.parameters(), 2.0)

        optimizer.step()
        optimizer.zero_grad()

        if (batch_idx % PRINT_PERIOD) == 0:
            # score
            print("#>>> ", positive_score, negative_score, "\t\t|\t\t", positive_score - negative_score)
            # loss
            print_message(batch_idx, train_loss / (batch_idx + 1))

        manage_checkpoints(colbert, optimizer, batch_idx + 1, args.output_dir)
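# Hedged sketch (assumption, not from the original code): a minimal invocation of
# train(). Each field mirrors an attribute referenced above; the values are
# illustrative placeholders, not the repo's actual defaults.
#
#   import argparse
#   args = argparse.Namespace(
#       use_dense=True, base_model="bert-base-uncased", similarity="cosine",
#       query_maxlen=32, doc_maxlen=180, dim=128,
#       n=None, k=None, normalize_sparse=False, use_nonneg=False, use_ortho=False,
#       lr=3e-6, bsize=32, accumsteps=2, maxsteps=400000,
#       triples="data/triples.train.small.tsv", output_dir="checkpoints/",
#   )
#   train(args)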