Code example #1
import argparse

import pandas as pd
from tqdm import tqdm

from pyserini.dsearch import DprQueryEncoder
from pyserini.query_iterator import get_query_iterator, TopicsFormat

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compute embeddings for KILT topics')
    parser.add_argument('--topics', required=True)
    parser.add_argument('--output', default="embedding.pkl", help="Name and path to output file.")
    parser.add_argument('--encoder', metavar='path to query encoder checkpoint or encoder name',
                        required=True,
                        help="Path to query encoder pytorch checkpoint or hgf encoder model name")
    parser.add_argument('--tokenizer', metavar='name or path',
                        required=True,
                        help="Path to a hgf tokenizer name or path")
    parser.add_argument('--device', metavar='device to run query encoder', required=False, default='cpu',
                        help="Device to run query encoder, cpu or [cuda:0, cuda:1, ...]")
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics, TopicsFormat.KILT)
    query_encoder = DprQueryEncoder(encoder_dir=args.encoder, tokenizer_name=args.tokenizer, device=args.device)

    texts = []
    embeddings = []
    for topic_id, text in tqdm(query_iterator):
        texts.append(text)
        embeddings.append(query_encoder.encode(text))

    df = pd.DataFrame({
        'text': texts,
        'embedding': embeddings
    })

    df.to_pickle(args.output)
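Since the script persists the embeddings with DataFrame.to_pickle, they can be read back with pandas for downstream use. A minimal sketch, assuming the default output name embedding.pkl:

import numpy as np
import pandas as pd

# Load the DataFrame written by the encoding script above
# (assumes the default --output value, 'embedding.pkl').
df = pd.read_pickle('embedding.pkl')

# Each row holds one query text and its dense vector; stacking the
# vectors yields a (num_queries, dim) matrix for batch operations.
matrix = np.vstack(df['embedding'].to_list())
print(matrix.shape)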
Code example #2
File: __main__.py  Project: mrkarezina/pyserini
    parser.add_argument('--batch-size',
                        type=int,
                        metavar='num',
                        required=False,
                        default=1,
                        help="Specify batch size to search the collection concurrently.")
    parser.add_argument('--threads',
                        type=int,
                        metavar='num',
                        required=False,
                        default=1,
                        help="Maximum number of threads to use.")
    parser.add_argument('--tokenizer',
                        type=str,
                        help='tokenizer used to preprocess topics')
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics,
                                        TopicsFormat(args.topics_format))
    topics = query_iterator.topics

    if os.path.exists(args.index):
        # create searcher from index directory
        searcher = SimpleSearcher(args.index)
    else:
        # create searcher from prebuilt index name
        searcher = SimpleSearcher.from_prebuilt_index(args.index)

    # from_prebuilt_index returns None when the index name is unknown,
    # so bail out before calling any method on the searcher
    if not searcher:
        exit()

    if args.language != 'en':
        searcher.set_language(args.language)
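Once a SimpleSearcher has been constructed, retrieval itself is a single call. A minimal sketch of the surrounding API, assuming the prebuilt robust04 index and an arbitrary query string:

from pyserini.search import SimpleSearcher

# Download and open a prebuilt index by name ('robust04' is just an
# illustrative choice; any prebuilt index name works).
searcher = SimpleSearcher.from_prebuilt_index('robust04')

# Retrieve the top-10 hits for a free-text query.
hits = searcher.search('black bear attacks', k=10)
for i, hit in enumerate(hits):
    print(f'{i + 1:2} {hit.docid:15} {hit.score:.5f}')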
Code example #3
File: encode_queries.py  Project: yuki617/pyserini
    parser.add_argument('--encoder',
                        type=str,
                        help='encoder name or path',
                        default='facebook/dpr-question_encoder-multiset-base',
                        required=False)
    parser.add_argument('--output',
                        type=str,
                        help='path to store query embeddings',
                        required=True)
    parser.add_argument('--device',
                        type=str,
                        help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cpu',
                        required=False)
    args = parser.parse_args()

    query_iterator = get_query_iterator(args.topics, TopicsFormat.DEFAULT)
    topics = query_iterator.topics

    encoder = DkrrDprQueryEncoder(args.encoder, args.device)

    embeddings = {'id': [], 'text': [], 'embedding': []}
    for topic_id, text in tqdm(query_iterator, total=len(topics)):
        embeddings['id'].append(topic_id)
        embeddings['text'].append(text)
        embeddings['embedding'].append(encoder.encode(text))
    embeddings = pd.DataFrame(embeddings)
    embeddings.to_pickle(args.output)
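DPR-family encoders are trained with an inner-product objective, so the stored query vectors can be compared with a plain dot product. A hedged sketch; 'queries.pkl' stands in for whatever path was passed to --output:

import numpy as np
import pandas as pd

# Read back the pickled embeddings ('queries.pkl' is an assumed name;
# substitute the actual --output path).
df = pd.read_pickle('queries.pkl')

# The inner product between two query vectors gives their similarity
# score under DPR's training objective.
a, b = df['embedding'].iloc[0], df['embedding'].iloc[1]
print(float(np.dot(a, b)))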