def precache_files(self, contest_id):
    """RPC to ask the worker to precache all files in the contest.

    contest_id (int): the id of the contest

    """
    # In order to avoid a long-living connection, first fetch the
    # complete list of files and then download the files; since
    # this is just pre-caching, possible race conditions are not
    # dangerous
    logger.info("Precaching files for contest %d.", contest_id)
    with SessionGen() as session:
        contest = Contest.get_from_id(contest_id, session)
        files = enumerate_files(session, contest,
                                skip_submissions=True,
                                skip_user_tests=True,
                                skip_print_jobs=True)
    for digest in files:
        try:
            self.file_cacher.load(digest, if_needed=True)
        except KeyError:
            # No problem (at this stage) if we cannot find the
            # file
            pass
    logger.info("Precaching finished.")
def clean_files(session, dry_run):
    filecacher = FileCacher()
    files = set(file[0] for file in filecacher.list())
    logger.info("A total number of %d files are present in the file store",
                len(files))
    found_digests = enumerate_files(session)
    logger.info("Found %d digests while scanning", len(found_digests))
    files -= found_digests
    logger.info("%d digests are orphan.", len(files))
    total_size = 0
    for orphan in files:
        total_size += filecacher.get_size(orphan)
    logger.info("Orphan files take %s bytes of disk space",
                "{:,}".format(total_size))
    if not dry_run:
        for count, orphan in enumerate(files):
            filecacher.delete(orphan)
            if count % 100 == 0:
                logger.info("%d files deleted from the file store", count)
        logger.info("All orphan files have been deleted")
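# The snippet below is an illustrative driver for clean_files(), not part of
# the original code: the command-line flag and entry point are hypothetical,
# shown only to make the session/dry-run contract of the function explicit.
import argparse

from cms.db import SessionGen


def main():
    parser = argparse.ArgumentParser(
        description="Delete files in the file store that no database "
                    "object references.")
    parser.add_argument("--dry-run", action="store_true",
                        help="only report orphan files, do not delete them")
    args = parser.parse_args()

    with SessionGen() as session:
        clean_files(session, args.dry_run)


if __name__ == "__main__":
    main()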
def precache_files(self, contest_id):
    """RPC to ask the worker to precache all files in the contest.

    contest_id (int): the id of the contest

    """
    lock = self.file_cacher.precache_lock()
    if lock is None:
        # Another worker is already precaching. Hence, this worker doesn't
        # need to do anything.
        logger.info("Another worker is already precaching files for "
                    "contest %d.", contest_id)
        return
    with lock:
        # In order to avoid a long-living connection, first fetch the
        # complete list of files and then download the files; since
        # this is just pre-caching, possible race conditions are not
        # dangerous
        logger.info("Precaching files for contest %d.", contest_id)
        with SessionGen() as session:
            contest = Contest.get_from_id(contest_id, session)
            files = enumerate_files(session, contest,
                                    skip_submissions=True,
                                    skip_user_tests=True,
                                    skip_print_jobs=True)
        for digest in files:
            try:
                self.file_cacher.cache_file(digest)
            except KeyError:
                # No problem (at this stage) if we cannot find the
                # file
                pass
        logger.info("Precaching finished.")
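# Illustrative sketch only -- not the actual FileCacher implementation.  It
# shows the contract that precache_files() above relies on: precache_lock()
# yields a context manager when the lock is acquired, and None when another
# worker on the same machine already holds it.  The class name and lock file
# path used here are hypothetical.
import fcntl


class NonBlockingPrecacheLock:
    """File-based, non-blocking advisory lock usable as a context manager."""

    def __init__(self, path="/tmp/cms-precache.lock"):
        self._fobj = open(path, "w")

    def try_acquire(self):
        """Return self if the lock was taken, None if it is already held."""
        try:
            fcntl.flock(self._fobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            self._fobj.close()
            return None
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        fcntl.flock(self._fobj, fcntl.LOCK_UN)
        self._fobj.close()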
def do_export(self):
    """Run the actual export code."""
    logger.info("Starting export.")

    export_dir = self.export_target
    archive_info = get_archive_info(self.export_target)

    if archive_info["write_mode"] != "":
        # We are able to write to this archive.
        if os.path.exists(self.export_target):
            logger.critical("The specified file already exists, "
                            "I won't overwrite it.")
            return False
        export_dir = os.path.join(tempfile.mkdtemp(),
                                  archive_info["basename"])

    logger.info("Creating dir structure.")
    try:
        os.mkdir(export_dir)
    except OSError:
        logger.critical("The specified directory already exists, "
                        "I won't overwrite it.")
        return False

    files_dir = os.path.join(export_dir, "files")
    descr_dir = os.path.join(export_dir, "descriptions")
    os.mkdir(files_dir)
    os.mkdir(descr_dir)

    with SessionGen() as session:
        # Export files.
        logger.info("Exporting files.")
        if self.dump_files:
            for contest_id in self.contests_ids:
                contest = Contest.get_from_id(contest_id, session)
                files = enumerate_files(
                    session, contest,
                    skip_submissions=self.skip_submissions,
                    skip_user_tests=self.skip_user_tests,
                    skip_users=self.skip_users,
                    skip_print_jobs=self.skip_print_jobs,
                    skip_generated=self.skip_generated)
                for file_ in files:
                    if not self.safe_get_file(
                            file_,
                            os.path.join(files_dir, file_),
                            os.path.join(descr_dir, file_)):
                        return False

        # Export data in JSON format.
        if self.dump_model:
            logger.info("Exporting data to a JSON file.")

            # We use strings because they'll be the keys of a JSON
            # object
            self.ids = {}
            self.queue = []

            data = dict()

            for cls, lst in [(Contest, self.contests_ids),
                             (User, self.users_ids),
                             (Task, self.tasks_ids)]:
                for i in lst:
                    obj = cls.get_from_id(i, session)
                    self.get_id(obj)

            # Specify the "root" of the data graph
            data["_objects"] = list(self.ids.values())

            while len(self.queue) > 0:
                obj = self.queue.pop(0)
                data[self.ids[obj.sa_identity_key]] = \
                    self.export_object(obj)

            data["_version"] = model_version

            destination = os.path.join(export_dir, "contest.json")
            with open(destination, "wt", encoding="utf-8") as fout:
                json.dump(data, fout, indent=4, sort_keys=True)

    # If the admin requested export to file, we do that.
    if archive_info["write_mode"] != "":
        with tarfile.open(self.export_target,
                          archive_info["write_mode"]) as archive:
            archive.add(export_dir, arcname=archive_info["basename"])
        rmtree(export_dir)

    logger.info("Export finished.")
    return True
def do_import(self):
    """Run the actual import code."""
    logger.info("Starting import.")

    archive = None
    if Archive.is_supported(self.import_source):
        archive = Archive(self.import_source)
        self.import_dir = archive.unpack()

        file_names = os.listdir(self.import_dir)
        if len(file_names) != 1:
            logger.critical("Cannot find a root directory in %s.",
                            self.import_source)
            archive.cleanup()
            return False

        self.import_dir = os.path.join(self.import_dir, file_names[0])

    if self.drop:
        logger.info("Dropping and recreating the database.")
        try:
            if not (drop_db() and init_db()):
                logger.critical(
                    "Unexpected error while dropping "
                    "and recreating the database.",
                    exc_info=True)
                return False
        except Exception:
            logger.critical("Unable to access DB.", exc_info=True)
            return False

    with SessionGen() as session:

        # Import the contest in JSON format.
        if self.load_model:
            logger.info("Importing the contest from a JSON file.")

            with io.open(os.path.join(self.import_dir,
                                      "contest.json"), "rb") as fin:
                # TODO - Throughout all the code we'll assume the
                # input is correct without actually doing any
                # validations.  Thus, for example, we're not
                # checking that the decoded object is a dict...
                self.datas = json.load(fin)

            # If the dump has been exported using a data model
            # different than the current one (that is, a previous
            # one) we try to update it.
            # If no "_version" field is found we assume it's a v1.0
            # export (before the new dump format was introduced).
            dump_version = self.datas.get("_version", 0)

            if dump_version < model_version:
                logger.warning(
                    "The dump you're trying to import has been created "
                    "by an old version of CMS (it declares data model "
                    "version %d). It may take a while to adapt it to "
                    "the current data model (which is version %d). You "
                    "can use cmsDumpUpdater to update the on-disk dump "
                    "and speed up future imports.",
                    dump_version, model_version)

            elif dump_version > model_version:
                logger.critical(
                    "The dump you're trying to import has been created "
                    "by a version of CMS newer than this one (it "
                    "declares data model version %d) and there is no "
                    "way to adapt it to the current data model (which "
                    "is version %d). You probably need to update CMS to "
                    "handle it. It is impossible to proceed with the "
                    "importation.", dump_version, model_version)
                return False

            else:
                logger.info("Importing dump with data model version %d.",
                            dump_version)

            for version in range(dump_version, model_version):
                # Update from version to version+1
                updater = __import__(
                    "cmscontrib.updaters.update_%d" % (version + 1),
                    globals(), locals(), ["Updater"]).Updater(self.datas)
                self.datas = updater.run()
                self.datas["_version"] = version + 1

            assert self.datas["_version"] == model_version

            self.objs = dict()
            for id_, data in iteritems(self.datas):
                if not id_.startswith("_"):
                    self.objs[id_] = self.import_object(data)
            for id_, data in iteritems(self.datas):
                if not id_.startswith("_"):
                    self.add_relationships(data, self.objs[id_])

            for k, v in list(iteritems(self.objs)):

                # Skip submissions if requested
                if self.skip_submissions and isinstance(v, Submission):
                    del self.objs[k]

                # Skip user_tests if requested
                if self.skip_user_tests and isinstance(v, UserTest):
                    del self.objs[k]

                # Skip print jobs if requested
                if self.skip_print_jobs and isinstance(v, PrintJob):
                    del self.objs[k]

                # Skip generated data if requested
                if self.skip_generated and \
                        isinstance(v, (SubmissionResult, UserTestResult)):
                    del self.objs[k]

            contest_id = list()
            contest_files = set()

            # We add explicitly only the top-level objects:
            # contests, and tasks and users not contained in any
            # contest. This will add on cascade all dependent
            # objects, and not add orphaned objects (like those
            # that depended on submissions or user tests that we
            # might have removed above).
            for id_ in self.datas["_objects"]:
                obj = self.objs[id_]
                session.add(obj)
                session.flush()

                if isinstance(obj, Contest):
                    contest_id += [obj.id]
                    contest_files |= enumerate_files(
                        session, obj,
                        skip_submissions=self.skip_submissions,
                        skip_user_tests=self.skip_user_tests,
                        skip_print_jobs=self.skip_print_jobs,
                        skip_generated=self.skip_generated)

            session.commit()
        else:
            contest_id = None
            contest_files = None

        # Import files.
        if self.load_files:
            logger.info("Importing files.")

            files_dir = os.path.join(self.import_dir, "files")
            descr_dir = os.path.join(self.import_dir, "descriptions")

            files = set(os.listdir(files_dir))
            descr = set(os.listdir(descr_dir))

            if not descr <= files:
                logger.warning("Some descriptions do not have an "
                               "associated file.")
            if not files <= descr:
                logger.warning("Some files do not have an associated "
                               "description.")

            if not (contest_files is None or files <= contest_files):
                # FIXME Check if it's because this is a light import
                # or because we're skipping submissions or user_tests
                logger.warning("The dump contains some files that are "
                               "not needed by the contest.")
            if not (contest_files is None or contest_files <= files):
                # The reason for this could be that it was a light
                # export that's not being reimported as such.
                logger.warning("The contest needs some files that are "
                               "not contained in the dump.")

            # Limit import to files we actually need.
            if contest_files is not None:
                files &= contest_files

            for digest in files:
                file_ = os.path.join(files_dir, digest)
                desc = os.path.join(descr_dir, digest)
                if not self.safe_put_file(file_, desc):
                    logger.critical(
                        "Unable to put file `%s' in the DB. "
                        "Aborting. Please remove the contest "
                        "from the database.", file_)
                    # TODO: remove contest from the database.
                    return False

    # Clean up, if an archive was used
    if archive is not None:
        archive.cleanup()

    if contest_id is not None:
        logger.info("Import finished (contest id: %s).",
                    ", ".join("%d" % id_ for id_ in contest_id))
    else:
        logger.info("Import finished.")

    return True
def do_import(self):
    """Run the actual import code."""
    logger.info("Starting import.")

    archive = None
    if Archive.is_supported(self.import_source):
        archive = Archive(self.import_source)
        self.import_dir = archive.unpack()

        file_names = os.listdir(self.import_dir)
        if len(file_names) != 1:
            logger.critical("Cannot find a root directory in %s.",
                            self.import_source)
            archive.cleanup()
            return False

        self.import_dir = os.path.join(self.import_dir, file_names[0])

    if self.drop:
        logger.info("Dropping and recreating the database.")
        try:
            if not (drop_db() and init_db()):
                logger.critical("Unexpected error while dropping "
                                "and recreating the database.",
                                exc_info=True)
                return False
        except Exception:
            logger.critical("Unable to access DB.", exc_info=True)
            return False

    with SessionGen() as session:

        # Import the contest in JSON format.
        if self.load_model:
            logger.info("Importing the contest from a JSON file.")

            with io.open(os.path.join(self.import_dir,
                                      "contest.json"), "rb") as fin:
                # TODO - Throughout all the code we'll assume the
                # input is correct without actually doing any
                # validations.  Thus, for example, we're not
                # checking that the decoded object is a dict...
                self.datas = json.load(fin)

            # If the dump has been exported using a data model
            # different than the current one (that is, a previous
            # one) we try to update it.
            # If no "_version" field is found we assume it's a v1.0
            # export (before the new dump format was introduced).
            dump_version = self.datas.get("_version", 0)

            if dump_version < model_version:
                logger.warning(
                    "The dump you're trying to import has been created "
                    "by an old version of CMS (it declares data model "
                    "version %d). It may take a while to adapt it to "
                    "the current data model (which is version %d). You "
                    "can use cmsDumpUpdater to update the on-disk dump "
                    "and speed up future imports.",
                    dump_version, model_version)

            elif dump_version > model_version:
                logger.critical(
                    "The dump you're trying to import has been created "
                    "by a version of CMS newer than this one (it "
                    "declares data model version %d) and there is no "
                    "way to adapt it to the current data model (which "
                    "is version %d). You probably need to update CMS to "
                    "handle it. It is impossible to proceed with the "
                    "importation.", dump_version, model_version)
                return False

            else:
                logger.info(
                    "Importing dump with data model version %d.",
                    dump_version)

            for version in range(dump_version, model_version):
                # Update from version to version+1
                updater = __import__(
                    "cmscontrib.updaters.update_%d" % (version + 1),
                    globals(), locals(), ["Updater"]).Updater(self.datas)
                self.datas = updater.run()
                self.datas["_version"] = version + 1

            assert self.datas["_version"] == model_version

            self.objs = dict()
            for id_, data in iteritems(self.datas):
                if not id_.startswith("_"):
                    self.objs[id_] = self.import_object(data)
            for id_, data in iteritems(self.datas):
                if not id_.startswith("_"):
                    self.add_relationships(data, self.objs[id_])

            for k, v in list(iteritems(self.objs)):

                # Skip submissions if requested
                if self.skip_submissions and isinstance(v, Submission):
                    del self.objs[k]

                # Skip user_tests if requested
                if self.skip_user_tests and isinstance(v, UserTest):
                    del self.objs[k]

                # Skip print jobs if requested
                if self.skip_print_jobs and isinstance(v, PrintJob):
                    del self.objs[k]

                # Skip generated data if requested
                if self.skip_generated and \
                        isinstance(v, (SubmissionResult, UserTestResult)):
                    del self.objs[k]

            contest_id = list()
            contest_files = set()

            # We add explicitly only the top-level objects:
            # contests, and tasks and users not contained in any
            # contest. This will add on cascade all dependent
            # objects, and not add orphaned objects (like those
            # that depended on submissions or user tests that we
            # might have removed above).
            for id_ in self.datas["_objects"]:
                obj = self.objs[id_]
                session.add(obj)
                session.flush()

                if isinstance(obj, Contest):
                    contest_id += [obj.id]
                    contest_files |= enumerate_files(
                        session, obj,
                        skip_submissions=self.skip_submissions,
                        skip_user_tests=self.skip_user_tests,
                        skip_print_jobs=self.skip_print_jobs,
                        skip_generated=self.skip_generated)

            session.commit()
        else:
            contest_id = None
            contest_files = None

        # Import files.
        if self.load_files:
            logger.info("Importing files.")

            files_dir = os.path.join(self.import_dir, "files")
            descr_dir = os.path.join(self.import_dir, "descriptions")

            files = set(os.listdir(files_dir))
            descr = set(os.listdir(descr_dir))

            if not descr <= files:
                logger.warning("Some descriptions do not have an "
                               "associated file.")
            if not files <= descr:
                logger.warning("Some files do not have an associated "
                               "description.")

            if not (contest_files is None or files <= contest_files):
                # FIXME Check if it's because this is a light import
                # or because we're skipping submissions or user_tests
                logger.warning("The dump contains some files that are "
                               "not needed by the contest.")
            if not (contest_files is None or contest_files <= files):
                # The reason for this could be that it was a light
                # export that's not being reimported as such.
                logger.warning("The contest needs some files that are "
                               "not contained in the dump.")

            # Limit import to files we actually need.
            if contest_files is not None:
                files &= contest_files

            for digest in files:
                file_ = os.path.join(files_dir, digest)
                desc = os.path.join(descr_dir, digest)
                if not self.safe_put_file(file_, desc):
                    logger.critical("Unable to put file `%s' in the DB. "
                                    "Aborting. Please remove the contest "
                                    "from the database.", file_)
                    # TODO: remove contest from the database.
                    return False

    # Clean up, if an archive was used
    if archive is not None:
        archive.cleanup()

    if contest_id is not None:
        logger.info("Import finished (contest id: %s).",
                    ", ".join("%d" % id_ for id_ in contest_id))
    else:
        logger.info("Import finished.")

    return True
def do_export(self):
    """Run the actual export code."""
    logger.info("Starting export.")

    export_dir = self.export_target
    archive_info = get_archive_info(self.export_target)

    if archive_info["write_mode"] != "":
        # We are able to write to this archive.
        if os.path.exists(self.export_target):
            logger.critical("The specified file already exists, "
                            "I won't overwrite it.")
            return False
        export_dir = os.path.join(tempfile.mkdtemp(),
                                  archive_info["basename"])

    logger.info("Creating dir structure.")
    try:
        os.mkdir(export_dir)
    except OSError:
        logger.critical("The specified directory already exists, "
                        "I won't overwrite it.")
        return False

    files_dir = os.path.join(export_dir, "files")
    descr_dir = os.path.join(export_dir, "descriptions")
    os.mkdir(files_dir)
    os.mkdir(descr_dir)

    with SessionGen() as session:
        # Export files.
        logger.info("Exporting files.")
        if self.dump_files:
            for contest_id in self.contests_ids:
                contest = Contest.get_from_id(contest_id, session)
                files = enumerate_files(
                    session, contest,
                    skip_submissions=self.skip_submissions,
                    skip_user_tests=self.skip_user_tests,
                    skip_print_jobs=self.skip_print_jobs,
                    skip_generated=self.skip_generated)
                for file_ in files:
                    if not self.safe_get_file(
                            file_,
                            os.path.join(files_dir, file_),
                            os.path.join(descr_dir, file_)):
                        return False

        # Export data in JSON format.
        if self.dump_model:
            logger.info("Exporting data to a JSON file.")

            # We use strings because they'll be the keys of a JSON
            # object
            self.ids = {}
            self.queue = []

            data = dict()

            for cls, lst in [(Contest, self.contests_ids),
                             (User, self.users_ids),
                             (Task, self.tasks_ids)]:
                for i in lst:
                    obj = cls.get_from_id(i, session)
                    self.get_id(obj)

            # Specify the "root" of the data graph
            data["_objects"] = list(itervalues(self.ids))

            while len(self.queue) > 0:
                obj = self.queue.pop(0)
                data[self.ids[obj.sa_identity_key]] = \
                    self.export_object(obj)

            data["_version"] = model_version

            destination = os.path.join(export_dir, "contest.json")
            if PY3:
                with io.open(destination, "wt", encoding="utf-8") as fout:
                    json.dump(data, fout, indent=4, sort_keys=True)
            else:
                with io.open(destination, "wb") as fout:
                    json.dump(data, fout, indent=4, sort_keys=True)

    # If the admin requested export to file, we do that.
    if archive_info["write_mode"] != "":
        with tarfile.open(self.export_target,
                          archive_info["write_mode"]) as archive:
            archive.add(export_dir, arcname=archive_info["basename"])
        rmtree(export_dir)

    logger.info("Export finished.")
    return True