Ejemplo n.º 1
0
 def load_index(self, opt, shared):
     """
     Attach the indexer and the passage dictionary to this instance.

     If ``shared`` is truthy, its ``'indexer'`` and ``'passages'`` entries are
     reused as-is. Otherwise a new indexer is built from ``opt``, deserialized
     from the configured index path (plus the dense embeddings path when one
     is configured), and the passages are loaded from disk.

     :param opt:
         options mapping; reads ``datapath``, ``path_to_index``,
         ``path_to_dpr_passages`` and ``path_to_dense_embeddings``.
     :param shared:
         mapping with ``'indexer'`` and ``'passages'`` entries, or a falsy
         value when nothing is shared.
     """
     # Shared state is available: reuse it and skip all loading.
     if shared:
         self.indexer = shared['indexer']
         self.passages = shared['passages']
         return

     self.indexer = indexer_factory(opt)

     datapath = opt['datapath']
     resolved_index = modelzoo_path(datapath, opt['path_to_index'])
     resolved_passages = modelzoo_path(datapath, opt['path_to_dpr_passages'])

     # The dense embeddings path is optional; None is forwarded when unset.
     dense_embeddings = opt['path_to_dense_embeddings']
     resolved_embeddings = (
         modelzoo_path(datapath, dense_embeddings)
         if dense_embeddings is not None
         else None
     )

     self.indexer.deserialize_from(resolved_index, resolved_embeddings)
     self.passages = load_passages_dict(resolved_passages)
    def run(self):
        """
        Build an index from the dense embedding shards and serialize it.

        Reads ``embeddings_dir``, ``embeddings_name``, ``indexer_type``,
        ``compressed_indexer_factory`` and ``save_index_dir`` from
        ``self.opt``. The resulting path is stored on ``self.index_path`` and
        the index object on ``self.index``.
        """
        opt = self.opt
        shard_dir = opt['embeddings_dir']
        emb_name = opt['embeddings_name']
        prefix = f"{emb_name}_" if emb_name else ''

        # NOTE(review): the count covers every matching '.pt' file in the
        # directory, not only files carrying ``prefix`` -- confirm the
        # directory holds nothing but this run's shards.
        shard_count = sum(
            1
            for fname in os.listdir(shard_dir)
            if fname.endswith('.pt') and 'sample' not in fname
        )
        input_files = [
            os.path.join(shard_dir, f'{prefix}{part}.pt')
            for part in range(shard_count)
        ]

        # Name of the index file: derived from the factory string for
        # compressed indexers, else the embeddings name, else a fixed default.
        if opt['indexer_type'] == 'compressed':
            index_name = opt['compressed_indexer_factory'].replace(',', '__')
        else:
            index_name = opt['embeddings_name'] or 'hnsw_flat'
        index_path = os.path.join(shard_dir, index_name)

        # An explicit save directory overrides the embeddings directory.
        if opt['save_index_dir']:
            index_name = os.path.basename(index_path)
            index_path = os.path.join(opt['save_index_dir'], index_name)
            if not os.path.exists(opt['save_index_dir']):
                logging.info(f'Creating directory for file {index_path}')
                os.makedirs(opt['save_index_dir'])

        logging.info(f'index path: {index_path}')
        self.index_path = index_path

        self.index = indexer_factory(opt)
        if opt['indexer_type'] == 'exact':
            self.index_data(input_files)
        else:
            self.train_then_add(input_files)

        # Persist the finished index.
        self.index.serialize(index_path)
Ejemplo n.º 3
0
 def __init__(self, opt: Opt, dictionary: DictionaryAgent, shared=None):
     """
     Initialize DPR Retriever.

     The indexer and passages are taken from ``shared`` when it is truthy;
     otherwise the indexer is created from ``opt`` and deserialized from the
     configured paths, and the passages are loaded from disk. The query
     encoder is constructed in both cases.

     :param opt:
         options; reads ``datapath``, ``path_to_index``,
         ``path_to_dpr_passages``, ``path_to_dense_embeddings``, ``n_docs``,
         ``query_model`` and ``dpr_model_file``.
     :param dictionary:
         forwarded unchanged to the parent initializer.
     :param shared:
         optional mapping with ``'indexer'`` and ``'passages'`` entries.
     """
     super().__init__(opt, dictionary, shared=shared)

     if shared:
         # Reuse the already-loaded index and passages.
         self.indexer = shared['indexer']
         self.passages = shared['passages']
     else:
         self.indexer = indexer_factory(opt)

         datapath = opt['datapath']
         resolved_index = modelzoo_path(datapath, opt['path_to_index'])
         resolved_passages = modelzoo_path(datapath, opt['path_to_dpr_passages'])

         # The dense embeddings path is optional; None is forwarded when unset.
         dense_embeddings = opt['path_to_dense_embeddings']
         resolved_embeddings = (
             modelzoo_path(datapath, dense_embeddings)
             if dense_embeddings is not None
             else None
         )

         self.indexer.deserialize_from(resolved_index, resolved_embeddings)
         self.passages = load_passages_dict(resolved_passages)

     self.n_docs = opt['n_docs']

     encoder_kwargs = {
         'dpr_model': opt['query_model'],
         'pretrained_path': opt['dpr_model_file'],
     }
     self.query_encoder = DprQueryEncoder(opt, **encoder_kwargs)