Example #1
0
def load_qrels(qrels_path):
    if qrels_path is None:
        return None

    print_message("#> Loading qrels from", qrels_path, "...")

    qrels = {}
    with open(qrels_path, mode="r", encoding="utf-8") as f:
        for line in f:
            qid, x, pid, y = map(int, line.strip().split("\t"))
            assert x == 0 and y == 1
            qrels[qid] = qrels.get(qid, [])
            qrels[qid].append(pid)

    assert all(len(qrels[qid]) == len(set(qrels[qid])) for qid in qrels)

    avg_positive = round(sum(len(qrels[qid]) for qid in qrels) / len(qrels), 2)

    print_message(
        "#> Loaded qrels for",
        len(qrels),
        "unique queries with",
        avg_positive,
        "positives per query on average.\n",
    )

    return qrels
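For context, the parser above expects a TREC-style qrels TSV in which every line is `qid \t 0 \t pid \t 1` (that is exactly what the asserts enforce). A minimal usage sketch; the file name and ids are illustrative:

# qrels.tsv (tab-separated), matching the asserts above:
#   3    0    721    1
#   3    0    980    1
#   7    0    112    1
qrels = load_qrels("qrels.tsv")
# -> {3: [721, 980], 7: [112]}, i.e. the positive pids grouped per query id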
Example #2
0
def load_colbert(args):
    print_message("#> Loading model checkpoint.")
    if args.dense:
        colbert = ColBERT.from_pretrained(
            "bert-base-uncased",
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            dim=args.dim,
            similarity_metric=args.similarity,
        )
    else:
        colbert = SparseColBERT.from_pretrained(
            "bert-base-uncased",
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            k=args.k,
            n=args.n,
            use_nonneg=args.use_nonneg,
            normalize_sparse=args.normalize_sparse,
            similarity_metric=args.similarity,
        )
    colbert = colbert.to(DEVICE)
    checkpoint = load_checkpoint(args.checkpoint, colbert)
    colbert.eval()

    print("\n")

    return colbert, checkpoint
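A minimal sketch of the argument namespace this loader reads; the field names come from the snippet, while the concrete values and the checkpoint path are assumptions:

from argparse import Namespace

args = Namespace(
    dense=True,                                 # False selects SparseColBERT instead
    query_maxlen=32, doc_maxlen=180, dim=128,   # assumed sizes
    k=None, n=None,                             # sparse-only settings, unused when dense=True
    use_nonneg=False, normalize_sparse=False,
    similarity="cosine",
    checkpoint="checkpoints/colbert.dnn",       # hypothetical path
)
colbert, checkpoint = load_colbert(args)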
Example #3
0
def __init__(self, data_file):
    print_message("#> Training with the triples in", data_file, "...\n\n")
    if data_file.endswith(".parquet"):
        self.data = pd.read_parquet(data_file).values.tolist()
    else:
        self.data = pd.read_csv(
            data_file, sep="\t", names=["query", "pos", "neg"]
        ).values.tolist()
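The reader accepts either a Parquet file or a headerless TSV; a sketch of the TSV layout implied by the `names=["query", "pos", "neg"]` call (file name and text are illustrative, and the owning class is not shown in the snippet):

# triples.tsv (no header, tab-separated): query <TAB> positive passage <TAB> negative passage
#   what is foo<TAB>a passage that answers the query<TAB>an unrelated passage
# pd.read_csv(...).values.tolist() turns each line into a [query, pos, neg] list in self.data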
Example #4
0
def load_colbert(args):
    print_message("#> Loading model checkpoint.")
    colbert = MultiBERT.from_pretrained('bert-base-uncased')
    colbert = colbert.to(DEVICE)
    checkpoint = load_checkpoint(args.checkpoint, colbert)
    colbert.eval()

    print('\n')

    return colbert, checkpoint
Example #5
0
def train(args):
    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity)
    colbert = colbert.to(DEVICE)
    colbert.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(colbert.parameters(), lr=args.lr, eps=1e-8)

    optimizer.zero_grad()
    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    reader = TrainReader(args.triples)
    train_loss = 0.0

    for batch_idx in range(args.maxsteps):
        Batch = reader.get_minibatch(args.bsize)
        Batch = sorted(Batch, key=lambda x: max(len(x[1]), len(x[2])))

        for B_idx in range(args.accumsteps):
            size = args.bsize // args.accumsteps
            B = Batch[B_idx * size:(B_idx + 1) * size]
            Q, D1, D2 = zip(*B)

            colbert_out = colbert(Q + Q, D1 + D2)
            colbert_out1, colbert_out2 = colbert_out[:len(Q)], colbert_out[len(Q):]

            out = torch.stack((colbert_out1, colbert_out2), dim=-1)

            positive_score = round(colbert_out1.mean().item(), 2)
            negative_score = round(colbert_out2.mean().item(), 2)
            print("#>>>   ", positive_score, negative_score, '\t\t|\t\t',
                  positive_score - negative_score)

            loss = criterion(out, labels[:out.size(0)])
            loss = loss / args.accumsteps
            loss.backward()

            train_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(colbert.parameters(), 2.0)

        optimizer.step()
        optimizer.zero_grad()

        print_message(batch_idx, train_loss / (batch_idx + 1))

        manage_checkpoints(colbert, optimizer, batch_idx + 1)
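Why `labels` is all zeros: the positive and negative scores are stacked along the last dimension, so class 0 is always the correct one and the cross-entropy reduces to a pairwise softmax loss. A standalone toy check, not part of the training code:

import torch
import torch.nn as nn

scores_pos = torch.tensor([5.0, 3.0])                  # scores for the positive docs
scores_neg = torch.tensor([2.0, 4.0])                  # scores for the negative docs
out = torch.stack((scores_pos, scores_neg), dim=-1)    # shape (bsize, 2)
labels = torch.zeros(2, dtype=torch.long)              # index 0 = positive column
loss = nn.CrossEntropyLoss()(out, labels)              # softmax over (pos, neg) per pair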
Example #6
0
def load_colbert(args):
    print_message("#> Loading model checkpoint.")
    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity)
    colbert = colbert.to(DEVICE)
    checkpoint = load_checkpoint(args.checkpoint, colbert)
    colbert.eval()

    print('\n')

    return colbert, checkpoint
Example #7
0
def encode(args, number_of_subindexes_already_saved=0):
    # TODO: Create a metadata file; save `args.input_arguments` in there
    create_directory(args.index)

    args.bsize = args.bsize * torch.cuda.device_count()

    print("#> Starting with NUM_GPUs =", torch.cuda.device_count())
    print("#> Accordingly, setting total args.bsize =", args.bsize)

    colbert = args.colbert
    colbert.bert = nn.DataParallel(colbert.bert)
    colbert.linear = nn.DataParallel(colbert.linear)
    colbert = colbert.cuda()
    colbert.eval()

    print('\n\n\n')
    print("#> args.output_dir =", args.output_dir)
    print("#> number_of_subindexes_already_saved =",
          number_of_subindexes_already_saved)
    print('\n\n\n')

    super_batch_idx = 0
    super_batch, batch_indices = [], []

    with open(args.collection) as f:
        for idx, passage in enumerate(f):
            if len(super_batch) == SUPER_BATCH_SIZE:
                if super_batch_idx < number_of_subindexes_already_saved:
                    print("#> Skipping super_batch_idx =", super_batch_idx,
                          ".......")
                else:
                    process_batch(args, super_batch_idx, batch_indices,
                                  super_batch)

                print_message("Processed", str(idx), "passages so far...\n")

                super_batch_idx += 1
                super_batch, batch_indices = [], []

            pid, passage = passage.split('\t')
            super_batch.append(passage)
            batch_indices.append(idx)

            assert int(pid) == idx

    if len(super_batch):
        process_batch(args, super_batch_idx, batch_indices, super_batch)
        super_batch_idx += 1
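For reference, the loop above assumes a collection TSV in which the passage id equals the zero-based line index (enforced by `assert int(pid) == idx`); a sketch with illustrative text:

# collection.tsv (tab-separated):
#   0<TAB>text of the first passage
#   1<TAB>text of the second passage
#   2<TAB>text of the third passage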
Example #8
0
def load_topK(topK_path):
    queries = {}
    topK_docs = {}
    topK_pids = {}

    print_message("#> Loading the top-k per query from", topK_path, "...")

    with open(topK_path, encoding="utf-8") as f:
        for line in f:
            qid, pid, query, passage = line.split("\t")
            qid, pid = int(qid), int(pid)

            assert (qid not in queries) or (queries[qid] == query)
            queries[qid] = query
            topK_docs[qid] = topK_docs.get(qid, [])
            topK_docs[qid].append(passage)
            topK_pids[qid] = topK_pids.get(qid, [])
            topK_pids[qid].append(pid)

    assert all(
        len(topK_pids[qid]) == len(set(topK_pids[qid])) for qid in topK_pids)

    Ks = [len(topK_pids[qid]) for qid in topK_pids]

    print_message("#> max(Ks) =", max(Ks), ", avg(Ks) =",
                  round(sum(Ks) / len(Ks), 2))
    print_message("#> Loaded the top-k per query for", len(queries),
                  "unique queries.\n")

    return queries, topK_docs, topK_pids
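A sketch of the top-k file this parser expects, one candidate per line as `qid \t pid \t query \t passage` (ids and text are illustrative):

# top1000.tsv (tab-separated):
#   3<TAB>721<TAB>what is foo<TAB>a candidate passage for query 3
#   3<TAB>980<TAB>what is foo<TAB>another candidate for query 3
queries, topK_docs, topK_pids = load_topK("top1000.tsv")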
Example #9
0
def process_batch(args, super_batch_idx, batch_indices, super_batch):
    colbert = args.colbert

    start_time = time()
    print_message("Start process_batch()", "")

    collection = []
    collection_indices = []

    with torch.no_grad():
        super_batch = list(pool.map(Tokenizer.tokenize, super_batch))  # pool and Tokenizer come from module scope (not shown in this snippet)
        print_message("Done tokenizing", "")

        sorted_idxs = sorted(range(len(super_batch)), key=lambda i: len(super_batch[i]))
        print_message("Done sorting", "")

        bucketed_outputs = []

        for batch_idx in range(ceil(len(super_batch) / args.bsize)):
            D_idxs = sorted_idxs[batch_idx * args.bsize : (batch_idx + 1) * args.bsize]
            D = [super_batch[d] for d in D_idxs]
            bucketed_outputs.append(
                # to_indexed_list(*colbert.doc(D, return_mask=True), nbytes=args.bytes)
                to_indexed_list(colbert.doc(D), nbytes=args.bytes)
            )
            collection_indices += [batch_indices[d] for d in D_idxs]

        for output in bucketed_outputs:
            collection += [d for d in output]

    throughput = round(len(super_batch) / (time() - start_time) * 60, 2)
    print("This super-batch's encoding rate:", throughput, "passages per minute.")

    output_path = os.path.join(args.index, str(super_batch_idx) + ".pt")
    offset, endpos = min(collection_indices), max(collection_indices)

    print("#> Writing", offset, "to", endpos, "to", output_path, "...")

    assert len(collection) == len(collection_indices)
    assert endpos - offset + 1 == len(collection_indices), len(collection_indices)
    assert len(collection_indices) == len(set(collection_indices))

    collectionX = [None] * len(collection_indices)

    for pos, idx in enumerate(collection_indices):
        collectionX[idx - offset] = collection[pos]

    # modified for sparse
    # torch.save(collectionX, output_path)
    collectionX = sparse.csr_matrix(collectionX)
    sparse.save_npz(output_path, collectionX)

    print("#> Saved!\n\n")
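One detail worth noting when reading these shards back: `scipy.sparse.save_npz` appends a `.npz` suffix when the given file name does not already end in it, so the shard written above lands on disk as `<super_batch_idx>.pt.npz`. A minimal read-back sketch; the index directory name is an assumption:

import os
from scipy import sparse

subindex = sparse.load_npz(os.path.join("experiments/index", "0.pt.npz"))  # CSR matrix for super-batch 0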
Example #10
0
def __init__(self, data_file):
    print_message("#> Training with the triples in", data_file, "...\n\n")
    self.reader = open(data_file, mode='r', encoding="utf-8")
Example #11
0
def evaluate(args, index=None):
    qrels, queries, topK_docs, topK_pids = args.qrels, args.queries, args.topK_docs, args.topK_pids

    metrics = Metrics(mrr_depths={10},
                      recall_depths={50, 200, 1000},
                      total_queries=None)

    if index:
        args.buffer = torch.zeros(1000,
                                  args.doc_maxlen,
                                  args.dim,
                                  dtype=index[0].dtype)

    output_path = '.'.join([
        str(x)
        for x in [args.run_name, 'tsv', int(time.time())]
    ])
    output_path = os.path.join(args.output_dir, output_path)

    # TODO: Save an associated metadata file with the args.input_args

    with open(output_path, 'w') as outputfile:
        with torch.no_grad():
            keys = sorted(list(queries.keys()))
            random.shuffle(keys)

            for query_idx, qid in enumerate(keys):
                query = queries[qid]
                print_message(query_idx, qid, query, '\n')

                if (qrels and args.shortcircuit
                        and len(set(qrels[qid]) & set(topK_pids[qid])) == 0):
                    continue

                ranking = rerank(args, query, topK_pids[qid], topK_docs[qid],
                                 index)

                for i, (score, pid, passage) in enumerate(ranking):
                    outputfile.write(
                        '\t'.join([str(x) for x in [qid, pid, i + 1]]) + "\n")

                    if i + 1 in [1, 2, 5, 10, 20, 100]:
                        print("#> " + str(i + 1) + ") ", pid, ":", score,
                              '    ', passage)

                if qrels:
                    metrics.add(query_idx, qid, ranking, qrels[qid])

                    for i, (score, pid, passage) in enumerate(ranking):
                        if pid in qrels[qid]:
                            print("\n#> Found", pid, "at position", i + 1,
                                  "with score", score)
                            print(passage)

                    metrics.print_metrics(query_idx)

                print_message("#> checkpoint['batch'] =",
                              args.checkpoint['batch'], '\n')
                print("output_path =", output_path)
                print("\n\n")
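Each line written to the run file above is just `qid \t pid \t rank`; a sketch of the resulting TSV (ids are illustrative):

# <args.run_name>.tsv.<timestamp> (tab-separated): qid, pid, rank
#   3<TAB>721<TAB>1
#   3<TAB>980<TAB>2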
Example #12
0
def train(args):
    if args.use_dense:
        colbert = ColBERT.from_pretrained(
            args.base_model,
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            dim=args.dim,
            similarity_metric=args.similarity,
        )
    else:
        colbert = SparseColBERT.from_pretrained(
            args.base_model,
            query_maxlen=args.query_maxlen,
            doc_maxlen=args.doc_maxlen,
            n=args.n,
            k=args.k,
            normalize_sparse=args.normalize_sparse,
            use_nonneg=args.use_nonneg,
            use_ortho=args.use_ortho,
            similarity_metric=args.similarity,
        )
    colbert = colbert.to(DEVICE)
    colbert.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(colbert.parameters(), lr=args.lr, eps=1e-8)

    optimizer.zero_grad()
    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    reader = TrainReader(args.triples)
    # dset = TrainDataset(args.triples)
    # loader = DataLoader(dset, batch_size=args.bsize, num_workers=0, pin_memory=True)
    train_loss = 0.0

    PRINT_PERIOD = 100

    for batch_idx in tqdm(range(args.maxsteps)):
        Batch = reader.get_minibatch(args.bsize)
        # for batch_idx, Batch in enumerate(tqdm(loader)):
        #     if batch_idx > args.maxsteps:
        #         print_message("#> Finish training at", batch_idx, "...\n\n")
        #         break
        #     Batch = [[q, pos, neg] for (q, pos, neg) in zip(Batch[0], Batch[1], Batch[2])]
        Batch = sorted(Batch, key=lambda x: max(len(x[1]), len(x[2])))

        positive_score, negative_score = 0.0, 0.0
        for B_idx in range(args.accumsteps):
            size = args.bsize // args.accumsteps
            B = Batch[B_idx * size : (B_idx + 1) * size]
            Q, D1, D2 = zip(*B)

            colbert_out, QQ_emb, DD_emb = colbert(Q + Q, D1 + D2, return_embedding=True)
            colbert_out1, colbert_out2 = colbert_out[: len(Q)], colbert_out[len(Q) :]

            out = torch.stack((colbert_out1, colbert_out2), dim=-1)

            positive_score, negative_score = (
                round(colbert_out1.mean().item(), 2),
                round(colbert_out2.mean().item(), 2),
            )

            # if (B_idx % PRINT_PERIOD) == 0:
            #     print(
            #         "#>>>   ",
            #         positive_score,
            #         negative_score,
            #         "\t\t|\t\t",
            #         positive_score - negative_score,
            #     )

            loss = criterion(out, labels[: out.size(0)])
            if args.use_ortho:
                Q_emb, D1_emb, D2_emb = (
                    QQ_emb[: len(Q)],
                    DD_emb[: len(Q)],
                    DD_emb[len(Q) :],
                )
                loss_ortho = colbert.ortho_all([Q_emb, D1_emb, D2_emb])
                loss += loss_ortho
            loss = loss / args.accumsteps
            loss.backward()

            train_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(colbert.parameters(), 2.0)

        optimizer.step()
        optimizer.zero_grad()

        if (batch_idx % PRINT_PERIOD) == 0:
            # score
            print(
                "#>>>   ",
                positive_score,
                negative_score,
                "\t\t|\t\t",
                positive_score - negative_score,
            )

            # loss
            print_message(batch_idx, train_loss / (batch_idx + 1))

        manage_checkpoints(colbert, optimizer, batch_idx + 1, args.output_dir)