import json
import os

import torch

# NOTE: the two imports below assume the BLINK repository layout; adjust if your tree differs.
from blink.indexer.faiss_indexer import DenseFlatIndexer, DenseHNSWFlatIndexer
import blink.candidate_ranking.utils as utils


def load_candidates(
    entity_catalogue, entity_encoding, faiss_index=None, index_path=None, logger=None
):
    # only load the candidate encoding if we are not retrieving through a faiss index
    if faiss_index is None:
        candidate_encoding = torch.load(entity_encoding)
        indexer = None
    else:
        if logger:
            logger.info("Using faiss index to retrieve entities.")
        candidate_encoding = None
        assert index_path is not None, "Error! Empty indexer path."
        if faiss_index == "flat":
            indexer = DenseFlatIndexer(1)
        elif faiss_index == "hnsw":
            indexer = DenseHNSWFlatIndexer(1)
        else:
            raise ValueError("Error! Unsupported indexer type! Choose from flat,hnsw.")
        indexer.deserialize_from(index_path)

    # load all the 5903527 entities from the catalogue and build id/title/text mappings
    title2id = {}
    id2title = {}
    id2text = {}
    wikipedia_id2local_id = {}
    local_idx = 0
    with open(entity_catalogue, "r") as fin:
        lines = fin.readlines()
        for line in lines:
            entity = json.loads(line)

            if "idx" in entity:
                # entries carry either a "curid=<id>" URL or a plain identifier
                split = entity["idx"].split("curid=")
                if len(split) > 1:
                    wikipedia_id = int(split[-1].strip())
                else:
                    wikipedia_id = entity["idx"].strip()

                assert wikipedia_id not in wikipedia_id2local_id
                wikipedia_id2local_id[wikipedia_id] = local_idx

            title2id[entity["title"]] = local_idx
            id2title[local_idx] = entity["title"]
            id2text[local_idx] = entity["text"]
            local_idx += 1
    return (
        candidate_encoding,
        title2id,
        id2title,
        id2text,
        wikipedia_id2local_id,
        indexer,
    )
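
# Illustrative usage (sketch only, with placeholder paths): load the catalogue together
# with a prebuilt HNSW index and use the returned dictionaries to map between titles
# and local ids.
#
#     (
#         candidate_encoding,
#         title2id,
#         id2title,
#         id2text,
#         wikipedia_id2local_id,
#         indexer,
#     ) = load_candidates(
#         "models/entity.jsonl",
#         "models/all_entities_large.t7",
#         faiss_index="hnsw",
#         index_path="models/faiss_hnsw_index.pkl",
#     )
#     local_id = title2id["Douglas Adams"]
#     assert id2title[local_id] == "Douglas Adams"
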
def main(params):
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(output_path)

    logger.info(
        "Loading candidate encoding from path: %s" % params["candidate_encoding"]
    )
    candidate_encoding = torch.load(params["candidate_encoding"])
    vector_size = candidate_encoding.size(1)
    index_buffer = params["index_buffer"]

    # choose the FAISS index type: HNSW (approximate) or flat (exact)
    if params["hnsw"]:
        logger.info("Using HNSW index in FAISS")
        index = DenseHNSWFlatIndexer(vector_size, index_buffer)
    else:
        logger.info("Using Flat index in FAISS")
        index = DenseFlatIndexer(vector_size, index_buffer)

    logger.info("Building index.")
    index.index_data(candidate_encoding.numpy())
    logger.info("Done indexing data.")

    if params.get("save_index", None):
        index.serialize(output_path)
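

# A minimal command-line entry point (sketch): the flag names mirror the params keys
# read in main() above; the default index_buffer value and exact option spellings in
# the original script may differ.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Build a FAISS index over candidate entity encodings."
    )
    parser.add_argument("--output_path", required=True, help="Directory for logs and the serialized index.")
    parser.add_argument("--candidate_encoding", required=True, help="Path to the torch-saved candidate encoding tensor.")
    parser.add_argument("--index_buffer", type=int, default=50000, help="Number of vectors indexed per batch.")
    parser.add_argument("--hnsw", action="store_true", help="Use an HNSW index instead of a flat index.")
    parser.add_argument("--save_index", action="store_true", help="Serialize the built index to output_path.")
    args = parser.parse_args()
    main(args.__dict__)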