Example #1
    def test_model_from_pretrained(self):
        for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRContextEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRQuestionEncoder.from_pretrained(model_name)
            self.assertIsNotNone(model)

        for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = DPRReader.from_pretrained(model_name)
            self.assertIsNotNone(model)
Example #2
    def __init__(self,
                 contexts=None,
                 fill_context_embeddings=True,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        super(LongQAModel, self).__init__()
        self.device = device
        self.c_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
        self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
        self.q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
        self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
        self.r_model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base').to(device)
        self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
        self.contexts = contexts
        # Not enough time to load context embeddings in AWS SageMaker,
        # but can fill weights from saved state dict after loading model.
        if not self.contexts:
            with open('code/contexts.json') as f:
                self.contexts = json.load(f)
        context_embeddings = []
        with torch.no_grad():
            for context in self.contexts:
                input_ids = self.c_tokenizer(context, return_tensors='pt').to(device)["input_ids"]
                output = self.c_model(input_ids)
                context_embeddings.append(output.pooler_output)
        # Wrap the tensor before registering it, so it stays an nn.Parameter
        # (calling .to(device) on a Parameter returns a plain tensor).
        self.context_embeddings = nn.Parameter(torch.cat(context_embeddings, dim=0).to(device))
        print('cwd!:', os.getcwd())
        print(os.listdir('code'))
        self.noise_remover = joblib.load('code/filter_model.sav')
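A rough usage sketch for the class above, not part of the original code: DPR scores a passage by the inner product between question and context vectors, so retrieval over the precomputed self.context_embeddings reduces to a matrix product plus top-k (the retrieve name and the k default are assumptions):

    def retrieve(self, question, k=5):
        with torch.no_grad():
            input_ids = self.q_tokenizer(question, return_tensors='pt').to(self.device)["input_ids"]
            q_embedding = self.q_model(input_ids).pooler_output               # shape (1, 768)
            scores = torch.matmul(q_embedding, self.context_embeddings.T).squeeze(0)
            top_scores, top_ids = torch.topk(scores, k=min(k, len(self.contexts)))
        return [(self.contexts[int(i)], float(s)) for i, s in zip(top_ids, top_scores)]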
Example #3
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage
    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
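Note that the embed function mapped over the dataset is not defined in this snippet. In the upstream Hugging Face script it is roughly the following (device there is a module-level torch.device; treat this as a sketch of that helper, not part of this excerpt):

def embed(documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast) -> dict:
    """Compute the DPR embeddings of document passages."""
    input_ids = ctx_tokenizer(
        documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
    )["input_ids"]
    embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}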
Example #4
    def __init__(self):
        self.ctx_encoder = DPRContextEncoder.from_pretrained(
            "facebook/dpr-ctx_encoder-multiset-base").to(Config.device)
        self.q_encoder = DPRQuestionEncoder.from_pretrained(
            "facebook/dpr-question_encoder-multiset-base").to(Config.device)
        self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            "facebook/dpr-ctx_encoder-multiset-base")
        self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(
            "facebook/dpr-question_encoder-multiset-base")
Example #5
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(
        device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir,
                                                "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
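The returned dataset can then be queried with a matching question encoder; a sketch, assuming the multiset question encoder that pairs with the context encoder above, where dataset is the value returned by generate_faiss_index_dataset:

from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast

q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")
q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-multiset-base")

input_ids = q_tokenizer("who discovered penicillin?", return_tensors="pt")["input_ids"]
question_embedding = q_encoder(input_ids).pooler_output.detach().numpy()[0]
scores, passages = dataset.get_nearest_examples("embeddings", question_embedding, k=5)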
Example #6
    def __init__(self):
        self.tokenizer_q = DPRQuestionEncoderTokenizer.from_pretrained(
            'facebook/dpr-question_encoder-single-nq-base')
        self.model_q = DPRQuestionEncoder.from_pretrained(
            'facebook/dpr-question_encoder-single-nq-base')
        self.model_q.to(DEVICE)
        self.tokenizer_d = DPRContextEncoderTokenizer.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base')
        self.model_d = DPRContextEncoder.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base')
        self.model_d.to(DEVICE)
Example #7
    def __init__(self):
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base')
        self.context_model = DPRContextEncoder.from_pretrained(
            'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)

        self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            'facebook/dpr-question_encoder-single-nq-base')
        self.query_encoder = DPRQuestionEncoder.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base")

        self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(
            'facebook/dpr-reader-single-nq-base')
        self.reader_model = DPRReader.from_pretrained(
            'facebook/dpr-reader-single-nq-base', return_dict=True)
        self.vector_length = 768
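A sketch of how the reader above is typically driven (the read method and sample inputs are illustrative assumptions; the DPRReader tokenizer takes questions, titles and texts, and the model returns span and relevance logits):

    def read(self, question, title, passage):
        encoded = self.reader_tokenizer(questions=question,
                                        titles=title,
                                        texts=passage,
                                        return_tensors='pt')
        outputs = self.reader_model(**encoded)
        # Span logits locate the answer inside the passage; relevance_logits
        # rank the passage itself.
        return outputs.start_logits, outputs.end_logits, outputs.relevance_logits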
Example #8
    def load_dataset(self) -> None:
        logger.debug('loading rag dataset: %s', self.name)

        self.dataset = load_dataset('csv',
                                    data_files=[self.csv_path],
                                    split='train',
                                    delimiter=',',
                                    column_names=['title', 'text'])

        self.dataset = self.dataset.map(
            split_documents,
            batched=False,  # batch_size is ignored when batched=False
            num_proc=6,
        )

        ctx_encoder = DPRContextEncoder.from_pretrained(
            self.context_encoder).to(device=self.device)
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            self.context_encoder)
        new_features = Features({
            'text': Value('string'),
            'title': Value('string'),
            'embeddings': Sequence(Value('float32'))
        })  # optional, save as float32 instead of float64 to save space

        self.dataset = self.dataset.map(
            partial(embed,
                    ctx_encoder=ctx_encoder,
                    ctx_tokenizer=ctx_tokenizer,
                    device=self.device),
            batched=True,
            batch_size=16,
            features=new_features,
        )

        self.dataset.save_to_disk(self.dataset_path)

        index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
        self.dataset.add_faiss_index('embeddings', custom_index=index)

        self.dataset.get_index('embeddings').save(self.faiss_path)
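To reuse the artifacts saved above without re-embedding, both files can be reloaded later; a sketch, assuming the same dataset_path and faiss_path values as in the method:

from datasets import load_from_disk

dataset = load_from_disk(dataset_path)              # passages + embeddings
dataset.load_faiss_index('embeddings', faiss_path)  # reattach the saved HNSW index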
Example #9
    def test_embed(self):

        ctx_encoder = DPRContextEncoder.from_pretrained(
            'facebook/dpr-ctx_encoder-multiset-base'
        ).to(device='cpu')
        ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            'facebook/dpr-ctx_encoder-multiset-base'
        )

        self.assertEqual(
            len(embed(
                {
                    'title': 'something',
                    'text': 'blah'
                },
                ctx_encoder,
                ctx_tokenizer,
                'cpu'
            )['embeddings'][0]),
            768
        )
Example #10
def download_model(outputdir_question_tokenizer: str,
                   outputdir_question_encoder: str,
                   outputdir_ctx_tokenizer: str, outputdir_ctx_encoder: str):
    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base")
    print("Save question tokenizer to ", outputdir_question_tokenizer)
    q_tokenizer.save_pretrained(outputdir_question_tokenizer)

    q_encoder = DPRQuestionEncoder.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base")
    print("Save question encoder to ", outputdir_question_encoder)
    q_encoder.save_pretrained(outputdir_question_encoder)

    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")
    print("Save context tokenizer to ", outputdir_ctx_tokenizer)
    ctx_tokenizer.save_pretrained(outputdir_ctx_tokenizer)

    ctx_encoder = DPRContextEncoder.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")
    print("Save context encoder to", outputdir_ctx_encoder)
    ctx_encoder.save_pretrained(outputdir_ctx_encoder)
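Since from_pretrained also accepts local directories, the saved artifacts can be restored offline; a short sketch reusing the four output paths above:

q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(outputdir_question_tokenizer)
q_encoder = DPRQuestionEncoder.from_pretrained(outputdir_question_encoder)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(outputdir_ctx_tokenizer)
ctx_encoder = DPRContextEncoder.from_pretrained(outputdir_ctx_encoder)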
Example #11
               map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)

torch.set_grad_enabled(False)
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))
ctx_encoder.eval()

app = Flask(__name__)


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/', methods=['POST'])
def Answering():
    question = request.form['question']
    answers = get_answer(question, model, tokenizer, q_tokenizer, q_encoder,
                         ctx_tokenizer, ctx_encoder)
Example #12
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(
        rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset("csv",
                           data_files=[rag_example_args.csv_path],
                           split="train",
                           delimiter="\t",
                           column_names=["title", "text"])

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents,
                          batched=True,
                          num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(
        rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        rag_example_args.dpr_ctx_encoder_model_name)
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir,
                                 "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir,
                              "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
    # dataset.load_faiss_index("embeddings", index_path)  # to reload the index

    ######################################
    logger.info("Step 3 - Load RAG")
    ######################################

    # Easy way to load the model
    retriever = RagRetriever.from_pretrained(rag_example_args.rag_model_name,
                                             index_name="custom",
                                             indexed_dataset=dataset)
    model = RagSequenceForGeneration.from_pretrained(
        rag_example_args.rag_model_name, retriever=retriever)
    tokenizer = RagTokenizer.from_pretrained(rag_example_args.rag_model_name)

    # For distributed fine-tuning you'll need to provide the paths instead, as the dataset and the index are loaded separately.
    # retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path=passages_path, index_path=index_path)

    ######################################
    logger.info("Step 4 - Have fun")
    ######################################

    question = rag_example_args.question or "What does Moses' rod turn into ?"
    input_ids = tokenizer.question_encoder(question,
                                           return_tensors="pt")["input_ids"]
    generated = model.generate(input_ids)
    generated_string = tokenizer.batch_decode(generated,
                                              skip_special_tokens=True)[0]
    logger.info("Q: " + question)
    logger.info("A: " + generated_string)
Example #13
        help=
        'directory that contains corpus files to be encoded, in jsonl format.',
        required=True)
    parser.add_argument('--index',
                        type=str,
                        help='directory to store brute force index of corpus',
                        required=True)
    parser.add_argument('--batch', type=int, help='batch size', default=8)
    parser.add_argument('--device',
                        type=str,
                        help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cuda:0')
    args = parser.parse_args()

    tokenizer = DPRContextEncoderTokenizer.from_pretrained(args.encoder)
    model = DPRContextEncoder.from_pretrained(args.encoder)
    model.to(args.device)

    index = faiss.IndexFlatIP(args.dimension)

    if not os.path.exists(args.index):
        os.mkdir(args.index)

    titles = []
    texts = []
    with open(os.path.join(args.index, 'docid'), 'w') as id_file:
        for file in sorted(os.listdir(args.corpus)):
            file = os.path.join(args.corpus, file)
            if file.endswith('json') or file.endswith('jsonl'):
                print(f'Encoding {file}')
                with open(file, 'r') as corpus:
Example #14
    def __init__(self, hparams, **kwargs):
        # when loading from a pytorch lightning checkpoint, hparams are passed as dict
        if isinstance(hparams, dict):
            hparams = AttrDict(hparams)
        if hparams.model_type == "rag_sequence":
            self.model_class = RagSequenceForGeneration
        elif hparams.model_type == "rag_token":
            self.model_class = RagTokenForGeneration
        elif hparams.model_type == "bart":
            self.model_class = BartForConditionalGeneration
        else:
            self.model_class = T5ForConditionalGeneration
        self.is_rag_model = is_rag_model(hparams.model_type)

        config_class = RagConfig if self.is_rag_model else AutoConfig
        config = config_class.from_pretrained(hparams.model_name_or_path)

        # set retriever parameters
        config.index_name = hparams.index_name or config.index_name
        config.passages_path = hparams.passages_path or config.passages_path
        config.index_path = hparams.index_path or config.index_path
        config.use_dummy_dataset = hparams.use_dummy_dataset

        # set extra_model_params for generator configs and load_model
        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop",
                              "attention_dropout", "dropout")
        if self.is_rag_model:
            if hparams.prefix is not None:
                config.generator.prefix = hparams.prefix
            config.label_smoothing = hparams.label_smoothing
            hparams, config.generator = set_extra_model_params(
                extra_model_params, hparams, config.generator)
            if hparams.distributed_retriever == "ray":
                # The Ray retriever needs the handles to the retriever actors.
                retriever = RagRayDistributedRetriever.from_pretrained(
                    hparams.model_name_or_path,
                    hparams.actor_handles,
                    config=config)

                if hparams.end2end:
                    ctx_encoder_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
                        "facebook/dpr-ctx_encoder-multiset-base")
                    retriever.set_ctx_encoder_tokenizer(ctx_encoder_tokenizer)
            else:
                logger.info(
                    "please use RAY as the distributed retrieval method")

            model = self.model_class.from_pretrained(
                hparams.model_name_or_path, config=config, retriever=retriever)
            if hparams.end2end:
                ctx_encoder = DPRContextEncoder.from_pretrained(
                    hparams.context_encoder_name)
                model.set_context_encoder_for_training(ctx_encoder)
            prefix = config.question_encoder.prefix
        else:
            if hparams.prefix is not None:
                config.prefix = hparams.prefix
            hparams, config = set_extra_model_params(extra_model_params,
                                                     hparams, config)
            model = self.model_class.from_pretrained(
                hparams.model_name_or_path, config=config)
            prefix = config.prefix

        tokenizer = (RagTokenizer.from_pretrained(hparams.model_name_or_path)
                     if self.is_rag_model else AutoTokenizer.from_pretrained(
                         hparams.model_name_or_path))

        self.config_dpr = DPRConfig.from_pretrained(
            hparams.context_encoder_name)
        self.custom_config = hparams
        self.context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
            hparams.context_encoder_name)

        super().__init__(hparams,
                         config=config,
                         tokenizer=tokenizer,
                         model=model)

        save_git_info(self.hparams.output_dir)
        self.output_dir = Path(self.hparams.output_dir)
        self.dpr_ctx_check_dir = str(Path(
            self.hparams.output_dir)) + "/dpr_ctx_checkpoint"
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }
        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens[
            "val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens[
            "test"], f"target_lens: {self.target_lens}"

        self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
        self.distributed_port = self.hparams.distributed_port

        # For single GPU training, init_ddp_connection is not called.
        # So we need to initialize the retrievers here.
        if hparams.gpus <= 1:
            if hparams.distributed_retriever == "ray":
                self.model.retriever.init_retrieval()
            else:
                logger.info(
                    "please use RAY as the distributed retrieval method")

        self.distributed_retriever = hparams.distributed_retriever
Example #15
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors produced by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''

    INDEX_NAME = 'dense-passage-retrieval'
    D = 768
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            # The class defines context/question encoders (there is no reader
            # attribute here); move both onto the GPU.
            self.context_model = self.context_model.cuda()
            self.question_model = self.question_model.cuda()
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic Index. Deletes old ones if needed.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Initializes the data structure to keep track of which chunks
        correspond to which documents.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}  # {chunk: document}
        self.inverse_chunk_index = {}  # {document: [chunks]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies),
                                          total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                self.es.create(self.INDEX_NAME,
                               id=chunk_counter,
                               body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(
            question, return_tensors='pt')['input_ids'].to(self.device)
        embeddings = self.question_model(
            input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (passage) in vector space with the context encoder.'''
        input_ids = self.context_tokenizer(
            context, return_tensors='pt')['input_ids'].to(self.device)
        embeddings = self.context_model(
            input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor on the FAISS index of context vectors.

        Args:
            question (str):
                The natural language question, e.g. `who is bill gates?`
            k (int):
                The number of documents to return from the index.
        '''
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # For Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # For Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        for sparse_result in sparse_results:
            id, score = sparse_result['_id'], sparse_result['_score']
            id = int(id)
            results_index[id] = {'elastic_score': score}
        for dense_result in dense_results:
            id, score = dense_result['chunk_id'], dense_result['faiss_dist']
            if id in results_index:
                results_index[id]['faiss_dist'] = score
            else:
                results_index[id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results
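A hedged usage sketch for the class above (building the DPRDocument list is not shown in this excerpt; each document only needs the .body and .to_dict() used by the methods above):

index = DPRIndex(documents)  # documents: List[DPRDocument], assembled elsewhere
hits = index.search_dual_index('who is bill gates?')
for hit in hits:
    print(hit['chunk_id'], hit['scores'])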
Example #16
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from CustomDPRDataset import CustomDPRDataset
from tqdm import tqdm
import sys

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, AdamW, get_linear_schedule_with_warmup

# initialize tokenizers and models for context encoder and question encoder
context_name = 'facebook/dpr-ctx_encoder-multiset-base'  # set to what context encoder we want to use
question_name = 'facebook/dpr-question_encoder-multiset-base'  # set to what question encoder we want to use
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_name)
context_model = DPRContextEncoder.from_pretrained(context_name).cuda()
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_name)
question_model = DPRQuestionEncoder.from_pretrained(question_name).cuda()

nll = nn.NLLLoss()
# question_model.half()
# context_model.half()

# params
batch_size = 256
grad_accum = 8
lr = 1e-5
text_descrip = "batchsize256_gradaccum8_v2"

print("intialized models/tokenizers")

# initialize dataset
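The training loop itself is cut off in this excerpt. The standard DPR objective that these pieces (paired encoders, NLLLoss, grad_accum) point to is in-batch negatives; a sketch of one step, where question_input_ids and context_input_ids are assumed to come from CustomDPRDataset:

q_emb = question_model(question_input_ids.cuda()).pooler_output   # (B, 768)
ctx_emb = context_model(context_input_ids.cuda()).pooler_output   # (B, 768)
scores = torch.matmul(q_emb, ctx_emb.T)        # (B, B): diagonal entries are the positives
log_probs = F.log_softmax(scores, dim=1)
target = torch.arange(scores.size(0), device=scores.device)
loss = nll(log_probs, target) / grad_accum
loss.backward()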
Example #17
    def __init__(self, model_name, tokenizer_name=None, device='cuda:0'):
        self.device = device
        self.model = DPRContextEncoder.from_pretrained(model_name)
        self.model.to(self.device)
        self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(
            tokenizer_name or model_name)