Example 1: dense retrieval setup. The snippet parses run options, initializes a query encoder, and creates a SimpleDenseSearcher from either a local index directory or a prebuilt index name.
                            help=f"Format of output. Available: {[x.value for x in list(OutputFormat)]}")
    run_parser.add_argument('--output', type=str, metavar='path', required=False, help="Path to output file.")
    run_parser.add_argument('--max-passage', action='store_true',
                            default=False, help="Select only max passage from document.")
    run_parser.add_argument('--max-passage-hits', type=int, metavar='num', required=False, default=100,
                            help="Final number of hits when selecting only max passage.")
    run_parser.add_argument('--max-passage-delimiter', type=str, metavar='str', required=False, default='#',
                            help="Delimiter between docid and passage id.")
    run_parser.add_argument('--batch-size', type=int, metavar='num', required=False,
                            default=1, help="Specify batch size to search the collection concurrently.")
    run_parser.add_argument('--threads', type=int, metavar='num', required=False,
                            default=1, help="Maximum number of threads to use.")

    args = parse_args(parser, commands)

    query_iterator = get_query_iterator(args.run.topics, TopicsFormat(args.run.topics_format))
    topics = query_iterator.topics

    query_encoder = init_query_encoder(args.dense.encoder,
                                       args.dense.tokenizer,
                                       args.run.topics,
                                       args.dense.encoded_queries,
                                       args.dense.device)

    if os.path.exists(args.dense.index):
        # create searcher from index directory
        dsearcher = SimpleDenseSearcher(args.dense.index, query_encoder)
    else:
        # create searcher from prebuilt index name
        dsearcher = SimpleDenseSearcher.from_prebuilt_index(args.dense.index, query_encoder)
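For context, a minimal sketch of the retrieval loop that typically follows this setup, assuming Pyserini's SimpleDenseSearcher.batch_search(queries, q_ids, k, threads) API; the cutoff k=10, the run tag 'dense', and the fallback output path are illustrative assumptions, not taken from the original script:

    # Collect all topics, search them in one concurrent batch, and write
    # the results in TREC run format (qid Q0 docid rank score tag).
    batch_qids, batch_queries = [], []
    for topic_id, text in query_iterator:
        batch_qids.append(str(topic_id))
        batch_queries.append(text)
    results = dsearcher.batch_search(batch_queries, batch_qids, k=10,
                                     threads=args.run.threads)
    with open(args.run.output or 'run.dense.trec', 'w') as out:  # fallback name is hypothetical
        for qid in batch_qids:
            for rank, hit in enumerate(results[qid], start=1):
                out.write(f'{qid} Q0 {hit.docid} {rank} {hit.score:.6f} dense\n')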
Example 2: sparse retrieval setup. The snippet parses search options and creates a SimpleSearcher from either a local index directory or a prebuilt index name, switching the analyzer language when it is not English.
    parser.add_argument('--batch-size',
                        type=int,
                        metavar='num',
                        required=False,
                        default=1,
                        help="Specify batch size to search the collection concurrently.")
    parser.add_argument('--threads',
                        type=int,
                        metavar='num',
                        required=False,
                        default=1,
                        help="Maximum number of threads to use.")
    parser.add_argument('--tokenizer',
                        type=str,
                        help='tokenizer used to preprocess topics')
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics,
                                        TopicsFormat(args.topics_format))
    topics = query_iterator.topics

    if os.path.exists(args.index):
        # create searcher from index directory
        searcher = SimpleSearcher(args.index)
    else:
        # create searcher from prebuilt index name
        searcher = SimpleSearcher.from_prebuilt_index(args.index)

    # from_prebuilt_index returns None on failure, so verify the searcher
    # exists before calling methods on it
    if not searcher:
        exit()

    if args.language != 'en':
        searcher.set_language(args.language)
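Again for context, a minimal sketch of how the searcher is typically driven, assuming Pyserini's SimpleSearcher.search(q, k) returning hits with docid and score attributes; k=10 and the 'bm25' run tag are illustrative:

    # Run each topic through the searcher and print TREC-format result lines.
    for topic_id, text in query_iterator:
        hits = searcher.search(text, k=10)
        for rank, hit in enumerate(hits, start=1):
            print(f'{topic_id} Q0 {hit.docid} {rank} {hit.score:.6f} bm25')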
Example 3: query encoding. The snippet parses encoder options, encodes each topic with DkrrDprQueryEncoder, and stores the query embeddings in a pickled pandas DataFrame.
    parser.add_argument('--encoder',
                        type=str,
                        help='encoder name or path',
                        default='facebook/dpr-question_encoder-multiset-base',
                        required=False)
    parser.add_argument('--output',
                        type=str,
                        help='path to store query embeddings',
                        required=True)
    parser.add_argument('--device',
                        type=str,
                        help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cpu',
                        required=False)
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics, TopicsFormat.DEFAULT)
    topics = query_iterator.topics

    encoder = DkrrDprQueryEncoder(args.encoder, args.device)

    embeddings = {'id': [], 'text': [], 'embedding': []}
    for topic_id, text in tqdm(query_iterator, total=len(topics)):
        embeddings['id'].append(topic_id)
        embeddings['text'].append(text)
        embeddings['embedding'].append(encoder.encode(text))
    embeddings = pd.DataFrame(embeddings)
    embeddings.to_pickle(args.output)
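The pickled DataFrame written above can be read back with pandas.read_pickle; a minimal sketch, where the file name is hypothetical (it stands in for whatever --output was) and the numpy stacking assumes each stored embedding is a 1-D array:

import numpy as np
import pandas as pd

# Load the DataFrame produced by the script above.
df = pd.read_pickle('query_embeddings.pkl')  # hypothetical path

# Stack the per-query vectors into a (num_queries, dim) matrix.
matrix = np.stack(df['embedding'].to_list())
print(df[['id', 'text']].head())
print(matrix.shape)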