Esempio n. 1
0
def main():
    """Command-line entry point that encodes a collection with ColBERT.

    Parses the command-line options, resolves the index path below
    ``--output_dir`` and the collection path below ``--data_dir``, loads the
    model through ``load_colbert`` and passes the populated namespace to
    ``encode``. The worker pool created here is released before returning,
    whether or not encoding succeeded.
    """
    random.seed(123456)

    # The help text describes indexing; the previous wording was copied from
    # the re-ranking evaluation script and did not match this command.
    parser = ArgumentParser(
        description='Encode a document collection with ColBERT to build an index.')

    parser.add_argument('--index', dest='index', required=True)
    parser.add_argument('--checkpoint', dest='checkpoint', required=True)
    parser.add_argument('--collection',
                        dest='collection',
                        default='collection.tsv')

    parser.add_argument('--data_dir',
                        dest='data_dir',
                        default=DEFAULT_DATA_DIR)
    parser.add_argument('--output_dir',
                        dest='output_dir',
                        default='outputs.index/')

    parser.add_argument('--bsize', dest='bsize', default=128, type=int)
    parser.add_argument('--bytes',
                        dest='bytes',
                        default=2,
                        choices=[2, 4],
                        type=int)
    parser.add_argument('--subsample', dest='subsample',
                        default=None)  # TODO: Add this

    # TODO: For the following four arguments, default should be None. If None, they should be loaded from checkpoint.
    parser.add_argument('--similarity',
                        dest='similarity',
                        default='cosine',
                        choices=['cosine', 'l2'])
    parser.add_argument('--dim', dest='dim', default=128, type=int)
    parser.add_argument('--query_maxlen',
                        dest='query_maxlen',
                        default=32,
                        type=int)
    parser.add_argument('--doc_maxlen',
                        dest='doc_maxlen',
                        default=180,
                        type=int)

    # TODO: Add resume functionality

    args = parser.parse_args()
    # NOTE(review): this stores a reference to ``args`` itself, not a
    # snapshot, so later mutations are visible through ``input_arguments``
    # too. Confirm whether a copy was intended before changing it.
    args.input_arguments = args
    args.pool = Pool(10)

    try:
        create_directory(args.output_dir)

        # The index lives under the output directory, the collection under
        # the data directory.
        args.index = os.path.join(args.output_dir, args.index)
        args.collection = os.path.join(args.data_dir, args.collection)

        args.colbert, args.checkpoint = load_colbert(args)

        encode(args)
    finally:
        # Release the worker processes even when loading or encoding fails.
        args.pool.close()
        args.pool.join()
Esempio n. 2
0
 def __init__(self, checkpoint, model_name='bert-base-uncased', tokenizer_name='bert-base-uncased', doc_attr="body", verbose=False):
     """Build the argument object, load the ColBERT model and keep both.

     Args:
         checkpoint: value handed to ``load_colbert`` as ``args.checkpoint``;
             it is replaced by the second item ``load_colbert`` returns.
         model_name: stored as ``args.bert``.
         tokenizer_name: stored as ``args.bert_tokenizer``.
         doc_attr: stored unchanged on ``self.doc_attr``.
         verbose: stored unchanged on ``self.verbose``.
     """
     # Everything load_colbert() receives, expressed as data rather than as
     # one assignment per line. The numeric values are fixed by this class.
     settings = (
         ('query_maxlen', 32),
         ('doc_maxlen', 180),
         ('dim', 128),
         ('bsize', 128),
         ('similarity', 'cosine'),
         ('checkpoint', checkpoint),
         ('pool', Pool(10)),
         ('bert', model_name),
         ('bert_tokenizer', tokenizer_name),
     )

     config = Object()
     for attr_name, attr_value in settings:
         setattr(config, attr_name, attr_value)

     model, loaded_checkpoint = load_colbert(config)
     config.colbert = model
     config.checkpoint = loaded_checkpoint

     self.args = config
     self.doc_attr = doc_attr
     self.verbose = verbose
Esempio n. 3
0
def main():
    """Command-line entry point that evaluates ColBERT re-ranking.

    Parses the command-line options, resolves the top-k and qrels files
    below ``--data_dir``, loads the model, the qrels and the top-k data, then
    calls ``evaluate_recall`` followed by ``evaluate``. The worker pool
    created here is released before returning, whether or not evaluation
    succeeded.

    Exits through ``parser.error`` (status 2) when ``--shortcircuit`` is
    requested while ``--qrels`` is empty.
    """
    random.seed(123456)

    parser = ArgumentParser(
        description=
        "Exhaustive (non-index-based) evaluation of re-ranking with ColBERT.")

    parser.add_argument("--checkpoint", dest="checkpoint", required=True)
    parser.add_argument("--topk", dest="topK", default="top1000.dev")
    parser.add_argument("--qrels", dest="qrels", default="qrels.dev.small.tsv")
    parser.add_argument("--shortcircuit",
                        dest="shortcircuit",
                        default=False,
                        action="store_true")

    parser.add_argument("--data_dir",
                        dest="data_dir",
                        default=DEFAULT_DATA_DIR)
    parser.add_argument("--output_dir",
                        dest="output_dir",
                        default="outputs.test/")

    parser.add_argument("--bsize", dest="bsize", default=128, type=int)
    parser.add_argument("--subsample", dest="subsample",
                        default=None)  # TODO: Add this
    parser.add_argument("--dense", action="store_true")

    # TODO: For the following four arguments, default should be None. If None, they should be loaded from checkpoint.
    parser.add_argument("--similarity",
                        dest="similarity",
                        default="cosine",
                        choices=["cosine", "l2"])
    parser.add_argument("--dim", dest="dim", default=128, type=int)
    parser.add_argument("--query_maxlen",
                        dest="query_maxlen",
                        default=32,
                        type=int)
    parser.add_argument("--doc_maxlen",
                        dest="doc_maxlen",
                        default=180,
                        type=int)
    parser.add_argument("--n", type=int, required=True)
    parser.add_argument("--k", type=float, required=True)
    parser.add_argument("--dont_normalize_sparse",
                        dest="normalize_sparse",
                        action="store_false")
    parser.add_argument("--use_nonneg", action="store_true")
    parser.add_argument("--use_ortho", action="store_true")

    args = parser.parse_args()
    # NOTE(review): this stores a reference to ``args`` itself, not a
    # snapshot of the parsed input. Confirm whether a copy was intended.
    args.input_arguments = args

    # Validate explicitly instead of with ``assert`` so that the check still
    # runs under ``python -O``.
    if args.shortcircuit and not args.qrels:
        parser.error(
            "Short-circuiting (i.e., applying minimal computation to queries with no positives [in the re-ranked set]) "
            "can only be applied if qrels is provided.")

    args.pool = Pool(10)

    try:
        # Keep the bare file name as the run name before it becomes a path.
        args.run_name = args.topK

        create_directory(args.output_dir)

        args.topK = os.path.join(args.data_dir, args.topK)

        if args.qrels:
            args.qrels = os.path.join(args.data_dir, args.qrels)

        # load_colbert() overwrites args.checkpoint, so keep the path around.
        args.checkpoint_path = args.checkpoint
        args.colbert, args.checkpoint = load_colbert(args)
        args.qrels = load_qrels(args.qrels)
        args.queries, args.topK_docs, args.topK_pids = load_topK(args.topK)

        evaluate_recall(args.qrels, args.queries, args.topK_pids)
        evaluate(args)
    finally:
        # Release the worker processes even when loading or evaluation fails.
        args.pool.close()
        args.pool.join()
Esempio n. 4
0
def main():
    """Command-line entry point that encodes a collection with ColBERT.

    Parses the command-line options (including the ``--dense``, ``--n``,
    ``--k`` and ``--dont_normalize_sparse`` switches), resolves the index
    path below ``--output_dir`` and the collection path below ``--data_dir``,
    loads the model through ``load_colbert`` and passes the populated
    namespace to ``encode``. The worker pool created here is released before
    returning, whether or not encoding succeeded.
    """
    random.seed(123456)

    # The help text describes indexing; the previous wording was copied from
    # the re-ranking evaluation script and did not match this command.
    parser = ArgumentParser(
        description="Encode a document collection with ColBERT to build an index.")

    parser.add_argument("--index", dest="index", required=True)
    parser.add_argument("--checkpoint", dest="checkpoint", required=True)
    parser.add_argument("--collection",
                        dest="collection",
                        default="collection.tsv")

    parser.add_argument("--data_dir",
                        dest="data_dir",
                        default=DEFAULT_DATA_DIR)
    parser.add_argument("--output_dir",
                        dest="output_dir",
                        default="outputs.index/")

    parser.add_argument("--bsize", dest="bsize", default=128, type=int)
    parser.add_argument("--bytes",
                        dest="bytes",
                        default=4,
                        choices=[2, 4],
                        type=int)
    parser.add_argument("--subsample", dest="subsample",
                        default=None)  # TODO: Add this

    # TODO: For the following four arguments, default should be None. If None, they should be loaded from checkpoint.
    parser.add_argument("--similarity",
                        dest="similarity",
                        default="cosine",
                        choices=["cosine", "l2"])
    parser.add_argument("--dim", dest="dim", default=128, type=int)
    parser.add_argument("--query_maxlen",
                        dest="query_maxlen",
                        default=32,
                        type=int)
    parser.add_argument("--doc_maxlen",
                        dest="doc_maxlen",
                        default=180,
                        type=int)
    parser.add_argument("--dense", action="store_true")
    parser.add_argument("--n", default=4096, type=int)
    parser.add_argument("--k", default=0.005, type=float)
    parser.add_argument("--dont_normalize_sparse",
                        dest="normalize_sparse",
                        action="store_false")

    # TODO: Add resume functionality

    args = parser.parse_args()
    # NOTE(review): this stores a reference to ``args`` itself, not a
    # snapshot, so later mutations are visible through ``input_arguments``
    # too. Confirm whether a copy was intended before changing it.
    args.input_arguments = args
    args.pool = Pool(4)

    try:
        create_directory(args.output_dir)

        # The index lives under the output directory, the collection under
        # the data directory.
        args.index = os.path.join(args.output_dir, args.index)
        args.collection = os.path.join(args.data_dir, args.collection)

        args.colbert, args.checkpoint = load_colbert(args)

        encode(args)
    finally:
        # Release the worker processes even when loading or encoding fails.
        args.pool.close()
        args.pool.join()
Esempio n. 5
0
def main():
    """Command-line entry point that evaluates ColBERT re-ranking.

    Parses the command-line options, resolves the top-k file (and the qrels
    file, when given) below ``--data_dir``, loads the model, the qrels and
    the top-k data, then calls ``evaluate_recall`` followed by ``evaluate``.
    The worker pool created here is released before returning, whether or
    not evaluation succeeded.

    Exits through ``parser.error`` (status 2) when ``--shortcircuit`` is
    requested without ``--qrels``.
    """
    random.seed(123456)

    parser = ArgumentParser(
        description=
        'Exhaustive (non-index-based) evaluation of re-ranking with ColBERT.')

    parser.add_argument('--checkpoint', dest='checkpoint', required=True)
    parser.add_argument('--topk', dest='topK', required=True)
    parser.add_argument('--qrels', dest='qrels', default=None)
    parser.add_argument('--shortcircuit',
                        dest='shortcircuit',
                        default=False,
                        action='store_true')

    parser.add_argument('--data_dir',
                        dest='data_dir',
                        default=DEFAULT_DATA_DIR)
    parser.add_argument('--output_dir',
                        dest='output_dir',
                        default='outputs.test/')

    parser.add_argument('--bsize', dest='bsize', default=128, type=int)
    parser.add_argument('--subsample', dest='subsample',
                        default=None)  # TODO: Add this

    # TODO: For the following four arguments, default should be None. If None, they should be loaded from checkpoint.
    parser.add_argument('--similarity',
                        dest='similarity',
                        default='cosine',
                        choices=['cosine', 'l2'])
    parser.add_argument('--dim', dest='dim', default=128, type=int)
    parser.add_argument('--query_maxlen',
                        dest='query_maxlen',
                        default=32,
                        type=int)
    parser.add_argument('--doc_maxlen',
                        dest='doc_maxlen',
                        default=180,
                        type=int)

    args = parser.parse_args()
    # NOTE(review): this stores a reference to ``args`` itself, not a
    # snapshot of the parsed input. Confirm whether a copy was intended.
    args.input_arguments = args

    # Validate explicitly instead of with ``assert`` so that the check still
    # runs under ``python -O``.
    if args.shortcircuit and not args.qrels:
        parser.error(
            "Short-circuiting (i.e., applying minimal computation to queries with no positives [in the re-ranked set]) "
            "can only be applied if qrels is provided.")

    args.pool = Pool(10)

    try:
        # Keep the bare file name as the run name before it becomes a path.
        args.run_name = args.topK

        create_directory(args.output_dir)

        args.topK = os.path.join(args.data_dir, args.topK)

        if args.qrels:
            args.qrels = os.path.join(args.data_dir, args.qrels)

        args.colbert, args.checkpoint = load_colbert(args)
        # When --qrels was omitted, load_qrels() receives None here.
        args.qrels = load_qrels(args.qrels)
        args.queries, args.topK_docs, args.topK_pids = load_topK(args.topK)

        evaluate_recall(args.qrels, args.queries, args.topK_pids)
        evaluate(args)
    finally:
        # Release the worker processes even when loading or evaluation fails.
        args.pool.close()
        args.pool.join()