def init(self, rank, root, experiment, name):
    """Set up this run's directory under root/experiment/script/name and wire up logging.

    Only the main process (rank < 1) touches the filesystem; every process then
    waits at the barrier so the directory exists before anyone proceeds.
    """
    assert '/' not in experiment, experiment
    assert '/' not in name, name

    self.experiments_root = os.path.abspath(root)
    self.experiment = experiment
    self.name = name
    self.path = os.path.join(self.experiments_root, self.experiment, self.script, self.name)

    if rank < 1:
        if not os.path.exists(self.path):
            create_directory(self.path)
        else:
            print('\n\n')
            print_message("It seems that ", self.path, " already exists.")
            print_message("Do you want to overwrite it? \t yes/no \n")
            # TODO: This should timeout and exit (i.e., fail) given no response for 60 seconds.
            answer = input()
            if answer.strip() != 'yes':
                # Deliberately trips when the path exists: refusing to overwrite aborts the run.
                assert not os.path.exists(self.path), self.path

    distributed.barrier(rank)

    logger = Logger(rank, self)
    self._logger = logger

    # Re-export the logger's interface directly on the run object.
    self._log_args = logger._log_args
    self.warn = logger.warn
    self.info = logger.info
    self.info_all = logger.info_all
    self.log_metric = logger.log_metric
    self.log_new_artifact = logger.log_new_artifact
def __init__(self, rank, run):
    """Logger bound to a run; only the main process initializes log outputs."""
    self.rank = rank
    self.run = run
    self.is_main = rank in (-1, 0)  # -1 is the single-process (non-distributed) case
    self.logs_path = os.path.join(run.path, "logs/")

    if not self.is_main:
        return

    self._init_mlflow()
    self.initialized_tensorboard = False
    create_directory(self.logs_path)
def main():
    """Precompute ColBERT document representations and write them to an index directory.

    The main process (rank < 1) creates the index directories and saves the
    run's arguments as metadata.json; all processes synchronize at barriers
    between the filesystem steps.
    """
    random.seed(12345)

    parser = Arguments(description='Precomputing document representations with ColBERT.')

    parser.add_model_parameters()
    parser.add_model_inference_parameters()
    parser.add_indexing_input()

    parser.add_argument('--chunksize', dest='chunksize', default=6.0, required=False, type=float)   # in GiBs

    args = parser.parse()

    with Run.context():
        args.index_path = os.path.join(args.index_root, args.index_name)

        # Refuse to clobber an existing index.
        assert not os.path.exists(args.index_path), args.index_path

        distributed.barrier(args.rank)

        if args.rank < 1:
            create_directory(args.index_root)
            create_directory(args.index_path)

        distributed.barrier(args.rank)

        process_idx = max(0, args.rank)
        encoder = CollectionEncoder(args, process_idx=process_idx, num_processes=args.nranks)
        encoder.encode()

        distributed.barrier(args.rank)

        # Save metadata (the parsed input arguments) alongside the index.
        if args.rank < 1:
            metadata_path = os.path.join(args.index_path, 'metadata.json')
            print_message("Saving (the following) metadata to", metadata_path, "..")
            print(args.input_arguments)

            with open(metadata_path, 'w') as output_metadata:
                ujson.dump(args.input_arguments.__dict__, output_metadata)

        distributed.barrier(args.rank)
def index(self, iterator):
    """Encode an iterator of documents into a ColBERT index, then build the FAISS index.

    Side effects: writes embeddings under self.args.index_path, a docnos.pkl.gz
    mapping of integer docids to docnos, and the FAISS partitions. Also stores
    the trained colbert model and checkpoint on self.

    Raises AssertionError if the index path already exists (no overwriting).
    """
    from timeit import default_timer as timer
    import pyterrier as pt  # hoisted: was imported twice (in convert_gen and again below)

    starttime = timer()

    assert not os.path.exists(self.args.index_path), self.args.index_path

    docnos = []
    docid = 0

    def convert_gen(iterator):
        # Assign sequential integer docids and remember each docno for the mapping file.
        nonlocal docnos
        nonlocal docid
        if self.num_docs is not None:
            iterator = pt.tqdm(iterator, total=self.num_docs, desc="encoding", unit="d")
        for l in iterator:
            l["docid"] = docid
            docnos.append(l['docno'])
            docid += 1
            yield l

    self.args.generator = convert_gen(iterator)
    ceg = CollectionEncoder_Generator(self.prepend_title, self.args, 0, 1)

    create_directory(self.args.index_root)
    create_directory(self.args.index_path)
    ceg.encode()
    self.colbert = ceg.colbert
    self.checkpoint = ceg.checkpoint

    assert os.path.exists(self.args.index_path), self.args.index_path
    num_embeddings = sum(load_doclens(self.args.index_path))
    print("#> num_embeddings =", num_embeddings)

    with pt.io.autoopen(os.path.join(self.args.index_path, "docnos.pkl.gz"), "wb") as f:
        pickle.dump(docnos, f)

    if self.args.partitions is None:
        # Heuristic: next power of two above 8 * sqrt(num_embeddings).
        self.args.partitions = 1 << math.ceil(math.log2(8 * math.sqrt(num_embeddings)))
        warn("You did not specify --partitions!")
        warn("Default computation chooses", self.args.partitions,
             "partitions (for {} embeddings)".format(num_embeddings))

    index_faiss(self.args)
    print("#> Faiss encoding complete")
    endtime = timer()
    print("#> Indexing complete, Time elapsed %0.2f seconds" % (endtime - starttime))
    # Persist run metadata next to the output file.
    save_metadata(f'{args.output}.meta', args)

    print('\n\n', args, '\n\n')
    print(args.output)
    print_message("#> Done.")


if __name__ == "__main__":
    random.seed(12345)

    parser = ArgumentParser(description='.')

    # Input / Output Arguments
    parser.add_argument('--metric', dest='metric', required=True, type=str)  # e.g., success.20
    parser.add_argument('--paths', dest='paths', required=True, type=str, nargs='+')
    parser.add_argument('--output', dest='output', required=True, type=str)

    args = parser.parse_args()

    # Split metric spec into name and depth, e.g. "success.20" -> ['success', '20'].
    args.metric = args.metric.split('.')

    # Refuse to overwrite an existing output file; ensure its directory exists.
    assert not os.path.exists(args.output), args.output
    create_directory(os.path.dirname(args.output))

    main(args)