class DumpExporter:

    """This service exports every data that CMS knows. The process of
    exporting and importing again should be idempotent.

    """

    def __init__(self, contest_ids, export_target,
                 dump_files, dump_model, skip_generated,
                 skip_submissions, skip_user_tests, skip_users,
                 skip_print_jobs):
        """Collect the IDs of the root objects and store the options.

        contest_ids (list|None): the IDs of the contests to export;
            if None, every contest, every user (unless skip_users is
            set) and every task not attached to a contest is
            exported.
        export_target (string): the path to export to; if empty, a
            name based on today's date is used.
        dump_files (bool): whether to export the files.
        dump_model (bool): whether to export the data as JSON.
        skip_generated (bool): whether to skip generated data.
        skip_submissions (bool): whether to skip submissions.
        skip_user_tests (bool): whether to skip user tests.
        skip_users (bool): whether to skip users and the objects
            related to them.
        skip_print_jobs (bool): whether to skip print jobs.

        """
        if contest_ids is None:
            with SessionGen() as session:
                contests = session.query(Contest).all()
                self.contests_ids = [contest.id for contest in contests]

                if not skip_users:
                    users = session.query(User).all()
                    self.users_ids = [user.id for user in users]
                else:
                    self.users_ids = []

                # Tasks must be collected regardless of skip_users:
                # doing the query only in the "not skip_users" branch
                # left `tasks` undefined when users were skipped.
                tasks = session.query(Task)\
                    .filter(Task.contest_id.is_(None)).all()
                self.tasks_ids = [task.id for task in tasks]
        else:
            # FIXME: this is ATM broken, because if you export a contest, you
            # then export the users who participated in it and then all of the
            # contests those users participated in.
            self.contests_ids = contest_ids
            self.users_ids = []
            self.tasks_ids = []

        self.dump_files = dump_files
        self.dump_model = dump_model
        self.skip_generated = skip_generated
        self.skip_submissions = skip_submissions
        self.skip_user_tests = skip_user_tests
        self.skip_users = skip_users
        self.skip_print_jobs = skip_print_jobs
        self.export_target = export_target

        # If target is not provided, we use a name based on today's date.
        if not export_target:
            self.export_target = "dump_%s.tar.gz" % date.today().isoformat()
            logger.warning("export_target not given, using \"%s\"",
                           self.export_target)

        self.file_cacher = FileCacher()

    def do_export(self):
        """Run the actual export code.

        return (bool): True if the export succeeded, False otherwise.

        """
        logger.info("Starting export.")

        export_dir = self.export_target
        temp_root = None
        archive_info = get_archive_info(self.export_target)
        to_archive = archive_info["write_mode"] != ""

        if to_archive:
            # We are able to write to this archive.
            if os.path.exists(self.export_target):
                logger.critical("The specified file already exists, "
                                "I won't overwrite it.")
                return False
            # Build the tree in a private temporary directory; it is
            # packed into the archive at the end.
            temp_root = tempfile.mkdtemp()
            export_dir = os.path.join(temp_root, archive_info["basename"])

        logger.info("Creating dir structure.")
        try:
            os.mkdir(export_dir)
        except OSError:
            logger.critical("The specified directory already exists, "
                            "I won't overwrite it.")
            return False

        files_dir = os.path.join(export_dir, "files")
        descr_dir = os.path.join(export_dir, "descriptions")
        os.mkdir(files_dir)
        os.mkdir(descr_dir)

        with SessionGen() as session:
            # Export files.
            logger.info("Exporting files.")
            if self.dump_files:
                for contest_id in self.contests_ids:
                    contest = Contest.get_from_id(contest_id, session)
                    files = enumerate_files(
                        session, contest,
                        skip_submissions=self.skip_submissions,
                        skip_user_tests=self.skip_user_tests,
                        skip_users=self.skip_users,
                        skip_print_jobs=self.skip_print_jobs,
                        skip_generated=self.skip_generated)
                    for file_ in files:
                        if not self.safe_get_file(
                                file_,
                                os.path.join(files_dir, file_),
                                os.path.join(descr_dir, file_)):
                            return False

            # Export data in JSON format.
            if self.dump_model:
                logger.info("Exporting data to a JSON file.")

                # We use strings because they'll be the keys of a JSON
                # object
                self.ids = {}
                self.queue = []

                data = dict()

                # Register the root objects; get_id also enqueues them.
                for cls, lst in [(Contest, self.contests_ids),
                                 (User, self.users_ids),
                                 (Task, self.tasks_ids)]:
                    for i in lst:
                        obj = cls.get_from_id(i, session)
                        self.get_id(obj)

                # Specify the "root" of the data graph
                data["_objects"] = list(self.ids.values())

                # export_object may enqueue further objects, so the
                # queue is drained until no new object is discovered.
                while len(self.queue) > 0:
                    obj = self.queue.pop(0)
                    data[self.ids[obj.sa_identity_key]] = \
                        self.export_object(obj)

                data["_version"] = model_version

                destination = os.path.join(export_dir, "contest.json")
                with open(destination, "wt", encoding="utf-8") as fout:
                    json.dump(data, fout, indent=4, sort_keys=True)

        # If the admin requested export to file, we do that.
        if to_archive:
            with tarfile.open(self.export_target,
                              archive_info["write_mode"]) as archive:
                archive.add(export_dir, arcname=archive_info["basename"])
            # Remove the whole temporary directory, not just the
            # exported tree inside it, so nothing is left behind.
            rmtree(temp_root)

        logger.info("Export finished.")

        return True

    def get_id(self, obj):
        """Return the export-local ID of the given object.

        If the object has not been seen yet, a new ID is assigned to
        it and the object is appended to the export queue.

        obj (object): an object exposing sa_identity_key.

        return (string): the ID of the object in the exported file.

        """
        obj_key = obj.sa_identity_key
        if obj_key not in self.ids:
            # We use strings because they'll be the keys of a JSON object
            self.ids[obj_key] = "%d" % len(self.ids)
            self.queue.append(obj)

        return self.ids[obj_key]

    def export_object(self, obj):
        """Export the given object, returning a JSON-encodable dict.

        The returned dict will contain a "_class" item (the name of the
        class of the given object), an item for each column property
        (with a value properly translated to a JSON-compatible type)
        and an item for each relationship property (which will be an ID
        or a collection of IDs).

        The IDs used in the exported dict aren't related to the ones
        used in the DB: they are newly generated and their scope is
        limited to the exported file only. They are shared among all
        classes (that is, two objects can never share the same ID, even
        if they are of different classes).

        If, when exporting the relationship, we find an object without
        an ID we generate a new ID, assign it to the object and append
        the object to the queue of objects to export.

        The self.skip_submissions flag controls whether we export
        submissions (and all other objects that can be reached only by
        passing through a submission) or not.

        """
        cls = type(obj)

        data = {"_class": cls.__name__}

        for prp in cls._col_props:
            col, = prp.columns

            val = getattr(obj, prp.key)
            data[prp.key] = encode_value(col.type, val)

        # Relationships towards these classes are dropped when users
        # are skipped.
        user_related_classes = (User, Admin, UserTest, Submission,
                                PrintJob, Message, Question, Announcement,
                                Participation)

        for prp in cls._rel_props:
            other_cls = prp.mapper.class_

            # Skip submissions if requested
            if self.skip_submissions and other_cls is Submission:
                continue

            # Skip user_tests if requested
            if self.skip_user_tests and other_cls is UserTest:
                continue

            # Skip users and user-related data if requested
            if self.skip_users and other_cls in user_related_classes:
                continue

            # Skip print jobs if requested
            if self.skip_print_jobs and other_cls is PrintJob:
                continue

            # Skip generated data if requested
            if self.skip_generated and other_cls in (SubmissionResult,
                                                     UserTestResult):
                continue

            val = getattr(obj, prp.key)
            if val is None:
                data[prp.key] = None
            elif isinstance(val, other_cls):
                data[prp.key] = self.get_id(val)
            elif isinstance(val, list):
                data[prp.key] = [self.get_id(i) for i in val]
            elif isinstance(val, dict):
                data[prp.key] = {k: self.get_id(v) for k, v in val.items()}
            else:
                raise RuntimeError("Unknown SQLAlchemy relationship type: %s"
                                   % type(val))

        return data

    def safe_get_file(self, digest, path, descr_path=None):
        """Get file from FileCacher ensuring that the digest is
        correct.

        digest (string): the digest of the file to retrieve.
        path (string): the path where to save the file.
        descr_path (string): the path where to save the description.

        return (bool): True if all ok, False if something wrong.

        """
        # TODO - Probably this method could be merged in FileCacher

        # First get the file
        try:
            self.file_cacher.get_file_to_path(digest, path)
        except Exception:
            logger.error("File %s could not retrieved from file server.",
                         digest, exc_info=True)
            return False

        # Then check the digest
        calc_digest = path_digest(path)
        if digest != calc_digest:
            logger.critical("File %s has wrong hash %s.",
                            digest, calc_digest)
            return False

        # If applicable, retrieve also the description
        if descr_path is not None:
            with open(descr_path, 'wt', encoding='utf-8') as fout:
                fout.write(self.file_cacher.describe(digest))

        return True
class DumpExporter(object):

    """This service exports every data that CMS knows. The process of
    exporting and importing again should be idempotent.

    """

    def __init__(self, contest_ids, export_target,
                 dump_files, dump_model, skip_generated,
                 skip_submissions, skip_user_tests):
        """Collect the IDs of the root objects and store the options.

        contest_ids (list|None): the IDs of the contests to export;
            if None, every contest, every user and every task not
            attached to a contest is exported.
        export_target (string): the path to export to; if empty, a
            name based on today's date is used.
        dump_files (bool): whether to export the files.
        dump_model (bool): whether to export the data as JSON.
        skip_generated (bool): whether to skip generated data.
        skip_submissions (bool): whether to skip submissions.
        skip_user_tests (bool): whether to skip user tests.

        """
        if contest_ids is None:
            with SessionGen() as session:
                contests = session.query(Contest).all()
                self.contests_ids = [contest.id for contest in contests]
                users = session.query(User).all()
                self.users_ids = [user.id for user in users]
                tasks = session.query(Task)\
                    .filter(Task.contest_id.is_(None)).all()
                self.tasks_ids = [task.id for task in tasks]
        else:
            # FIXME: this is ATM broken, because if you export a contest, you
            # then export the users who participated in it and then all of the
            # contests those users participated in.
            self.contests_ids = contest_ids
            self.users_ids = []
            self.tasks_ids = []
        self.dump_files = dump_files
        self.dump_model = dump_model
        self.skip_generated = skip_generated
        self.skip_submissions = skip_submissions
        self.skip_user_tests = skip_user_tests
        self.export_target = export_target

        # If target is not provided, we use a name based on today's date.
        if export_target == "":
            self.export_target = "dump_%s.tar.gz" % date.today().isoformat()
            logger.warning("export_target not given, using \"%s\"",
                           self.export_target)

        self.file_cacher = FileCacher()

    def do_export(self):
        """Run the actual export code.

        return (bool): True if the export succeeded, False otherwise.

        """
        logger.info("Starting export.")

        export_dir = self.export_target
        temp_root = None
        archive_info = get_archive_info(self.export_target)
        to_archive = archive_info["write_mode"] != ""

        if to_archive:
            # We are able to write to this archive.
            if os.path.exists(self.export_target):
                logger.critical("The specified file already exists, "
                                "I won't overwrite it.")
                return False
            # Build the tree in a private temporary directory; it is
            # packed into the archive at the end.
            temp_root = tempfile.mkdtemp()
            export_dir = os.path.join(temp_root, archive_info["basename"])

        logger.info("Creating dir structure.")
        try:
            os.mkdir(export_dir)
        except OSError:
            logger.critical("The specified directory already exists, "
                            "I won't overwrite it.")
            return False

        files_dir = os.path.join(export_dir, "files")
        descr_dir = os.path.join(export_dir, "descriptions")
        os.mkdir(files_dir)
        os.mkdir(descr_dir)

        with SessionGen() as session:

            # Export files.
            logger.info("Exporting files.")
            if self.dump_files:
                for contest_id in self.contests_ids:
                    contest = Contest.get_from_id(contest_id, session)
                    files = contest.enumerate_files(self.skip_submissions,
                                                    self.skip_user_tests,
                                                    self.skip_generated)
                    for file_ in files:
                        if not self.safe_get_file(
                                file_,
                                os.path.join(files_dir, file_),
                                os.path.join(descr_dir, file_)):
                            return False

            # Export data in JSON format.
            if self.dump_model:
                logger.info("Exporting data to a JSON file.")

                # We use strings because they'll be the keys of a JSON
                # object
                self.ids = {}
                self.queue = []

                data = dict()

                # Register the root objects; get_id also enqueues them.
                for cls, lst in [(Contest, self.contests_ids),
                                 (User, self.users_ids),
                                 (Task, self.tasks_ids)]:
                    for i in lst:
                        obj = cls.get_from_id(i, session)
                        self.get_id(obj)

                # Specify the "root" of the data graph
                data["_objects"] = self.ids.values()

                # export_object may enqueue further objects, so the
                # queue is drained until no new object is discovered.
                while len(self.queue) > 0:
                    obj = self.queue.pop(0)
                    data[self.ids[obj.sa_identity_key]] = \
                        self.export_object(obj)

                data["_version"] = model_version

                # NOTE(review): the `encoding` keyword of json.dump and
                # the binary-mode stream are Python 2 idioms; confirm
                # the target interpreter before porting this class.
                with io.open(os.path.join(export_dir,
                                          "contest.json"), "wb") as fout:
                    json.dump(data, fout, encoding="utf-8",
                              indent=4, sort_keys=True)

        # If the admin requested export to file, we do that.
        if to_archive:
            # The context manager closes the archive even if adding
            # the tree fails.
            with tarfile.open(self.export_target,
                              archive_info["write_mode"]) as archive:
                archive.add(export_dir, arcname=archive_info["basename"])
            # Remove the whole temporary directory, not just the
            # exported tree inside it, so nothing is left behind.
            rmtree(temp_root)

        logger.info("Export finished.")

        return True

    def get_id(self, obj):
        """Return the export-local ID of the given object.

        If the object has not been seen yet, a new ID is assigned to
        it and the object is appended to the export queue.

        obj (object): an object exposing sa_identity_key.

        return (string): the ID of the object in the exported file.

        """
        obj_key = obj.sa_identity_key
        if obj_key not in self.ids:
            # We use strings because they'll be the keys of a JSON object
            self.ids[obj_key] = "%d" % len(self.ids)
            self.queue.append(obj)

        return self.ids[obj_key]

    def export_object(self, obj):
        """Export the given object, returning a JSON-encodable dict.

        The returned dict will contain a "_class" item (the name of the
        class of the given object), an item for each column property
        (with a value properly translated to a JSON-compatible type)
        and an item for each relationship property (which will be an ID
        or a collection of IDs).

        The IDs used in the exported dict aren't related to the ones
        used in the DB: they are newly generated and their scope is
        limited to the exported file only. They are shared among all
        classes (that is, two objects can never share the same ID, even
        if they are of different classes).

        If, when exporting the relationship, we find an object without
        an ID we generate a new ID, assign it to the object and append
        the object to the queue of objects to export.

        The self.skip_submissions flag controls whether we export
        submissions (and all other objects that can be reached only by
        passing through a submission) or not.

        """
        cls = type(obj)

        data = {"_class": cls.__name__}

        for prp in cls._col_props:
            col, = prp.columns
            col_type = type(col.type)

            val = getattr(obj, prp.key)
            if col_type in \
                    [Boolean, Integer, Float, Unicode, RepeatedUnicode, Enum]:
                data[prp.key] = val
            elif col_type is String:
                data[prp.key] = \
                    val.decode('latin1') if val is not None else None
            elif col_type is DateTime:
                data[prp.key] = \
                    make_timestamp(val) if val is not None else None
            elif col_type is Interval:
                data[prp.key] = \
                    val.total_seconds() if val is not None else None
            else:
                raise RuntimeError("Unknown SQLAlchemy column type: %s"
                                   % col_type)

        for prp in cls._rel_props:
            other_cls = prp.mapper.class_

            # Skip submissions if requested
            if self.skip_submissions and other_cls is Submission:
                continue

            # Skip user_tests if requested
            if self.skip_user_tests and other_cls is UserTest:
                continue

            # Skip generated data if requested
            if self.skip_generated and other_cls in (SubmissionResult,
                                                     UserTestResult):
                continue

            val = getattr(obj, prp.key)
            if val is None:
                data[prp.key] = None
            elif isinstance(val, other_cls):
                data[prp.key] = self.get_id(val)
            elif isinstance(val, list):
                data[prp.key] = [self.get_id(i) for i in val]
            elif isinstance(val, dict):
                data[prp.key] = \
                    dict((k, self.get_id(v)) for k, v in val.iteritems())
            else:
                raise RuntimeError("Unknown SQLAlchemy relationship type: %s"
                                   % type(val))

        return data

    def safe_get_file(self, digest, path, descr_path=None):
        """Get file from FileCacher ensuring that the digest is
        correct.

        digest (string): the digest of the file to retrieve.
        path (string): the path where to save the file.
        descr_path (string): the path where to save the description.

        return (bool): True if all ok, False if something wrong.

        """
        # TODO - Probably this method could be merged in FileCacher

        # First get the file
        try:
            self.file_cacher.get_file_to_path(digest, path)
        except Exception:
            logger.error("File %s could not retrieved from file server.",
                         digest, exc_info=True)
            return False

        # Then check the digest
        calc_digest = sha1sum(path)
        if digest != calc_digest:
            logger.critical("File %s has wrong hash %s.",
                            digest, calc_digest)
            return False

        # If applicable, retrieve also the description
        if descr_path is not None:
            with io.open(descr_path, 'wt', encoding='utf-8') as fout:
                fout.write(self.file_cacher.describe(digest))

        return True
class ContestExporter(object):

    """This service exports every data about the contest that CMS
    knows. The process of exporting and importing again should be
    idempotent.

    """

    def __init__(self, contest_id, export_target,
                 dump_files, dump_model, skip_generated,
                 skip_submissions, skip_user_tests):
        """Store the options and work out the export target.

        contest_id (int): the ID of the contest to export.
        export_target (string): the path to export to; if empty, a
            name based on the contest's name is used.
        dump_files (bool): whether to export the files.
        dump_model (bool): whether to export the contest as JSON.
        skip_generated (bool): whether to skip generated data.
        skip_submissions (bool): whether to skip submissions.
        skip_user_tests (bool): whether to skip user tests.

        """
        self.contest_id = contest_id
        self.dump_files = dump_files
        self.dump_model = dump_model
        self.skip_generated = skip_generated
        self.skip_submissions = skip_submissions
        self.skip_user_tests = skip_user_tests

        # If target is not provided, we use the contest's name.
        if export_target == "":
            with SessionGen() as session:
                contest = Contest.get_from_id(self.contest_id, session)
                self.export_target = "dump_%s.tar.gz" % contest.name
                logger.warning("export_target not given, using \"%s\"",
                               self.export_target)
        else:
            self.export_target = export_target

        self.file_cacher = FileCacher()

    def do_export(self):
        """Run the actual export code.

        return (bool): True if the export succeeded, False otherwise.

        """
        logger.info("Starting export.")

        export_dir = self.export_target
        temp_root = None
        archive_info = get_archive_info(self.export_target)
        to_archive = archive_info["write_mode"] != ""

        if to_archive:
            # We are able to write to this archive.
            if os.path.exists(self.export_target):
                logger.critical("The specified file already exists, "
                                "I won't overwrite it.")
                return False
            # Build the tree in a private temporary directory; it is
            # packed into the archive at the end.
            temp_root = tempfile.mkdtemp()
            export_dir = os.path.join(temp_root, archive_info["basename"])

        logger.info("Creating dir structure.")
        try:
            os.mkdir(export_dir)
        except OSError:
            logger.critical("The specified directory already exists, "
                            "I won't overwrite it.")
            return False

        files_dir = os.path.join(export_dir, "files")
        descr_dir = os.path.join(export_dir, "descriptions")
        os.mkdir(files_dir)
        os.mkdir(descr_dir)

        with SessionGen() as session:

            contest = Contest.get_from_id(self.contest_id, session)

            # Export files.
            if self.dump_files:
                logger.info("Exporting files.")
                files = contest.enumerate_files(self.skip_submissions,
                                                self.skip_user_tests,
                                                self.skip_generated)
                for file_ in files:
                    if not self.safe_get_file(file_,
                                              os.path.join(files_dir, file_),
                                              os.path.join(descr_dir, file_)):
                        return False

            # Export the contest in JSON format.
            if self.dump_model:
                logger.info("Exporting the contest to a JSON file.")

                # We use strings because they'll be the keys of a JSON
                # object; the contest will have ID 0.
                self.ids = {contest.sa_identity_key: "0"}
                self.queue = [contest]

                data = dict()
                # export_object may enqueue further objects, so the
                # queue is drained until no new object is discovered.
                while len(self.queue) > 0:
                    obj = self.queue.pop(0)
                    data[self.ids[obj.sa_identity_key]] = \
                        self.export_object(obj)

                # Specify the "root" of the data graph
                data["_objects"] = ["0"]

                data["_version"] = model_version

                # NOTE(review): the `encoding` keyword of json.dump and
                # the binary-mode stream are Python 2 idioms; confirm
                # the target interpreter before porting this class.
                with io.open(os.path.join(export_dir,
                                          "contest.json"), "wb") as fout:
                    json.dump(data, fout, encoding="utf-8",
                              indent=4, sort_keys=True)

        # If the admin requested export to file, we do that.
        if to_archive:
            # The context manager closes the archive even if adding
            # the tree fails.
            with tarfile.open(self.export_target,
                              archive_info["write_mode"]) as archive:
                archive.add(export_dir, arcname=archive_info["basename"])
            # Remove the whole temporary directory, not just the
            # exported tree inside it, so nothing is left behind.
            rmtree(temp_root)

        logger.info("Export finished.")

        return True

    def get_id(self, obj):
        """Return the export-local ID of the given object.

        If the object has not been seen yet, a new ID is assigned to
        it and the object is appended to the export queue.

        obj (object): an object exposing sa_identity_key.

        return (string): the ID of the object in the exported file.

        """
        obj_key = obj.sa_identity_key
        if obj_key not in self.ids:
            # We use strings because they'll be the keys of a JSON object
            self.ids[obj_key] = str(len(self.ids))
            self.queue.append(obj)

        return self.ids[obj_key]

    def export_object(self, obj):
        """Export the given object, returning a JSON-encodable dict.

        The returned dict will contain a "_class" item (the name of the
        class of the given object), an item for each column property
        (with a value properly translated to a JSON-compatible type)
        and an item for each relationship property (which will be an ID
        or a collection of IDs).

        The IDs used in the exported dict aren't related to the ones
        used in the DB: they are newly generated and their scope is
        limited to the exported file only. They are shared among all
        classes (that is, two objects can never share the same ID, even
        if they are of different classes).

        If, when exporting the relationship, we find an object without
        an ID we generate a new ID, assign it to the object and append
        the object to the queue of objects to export.

        The self.skip_submissions flag controls whether we export
        submissions (and all other objects that can be reached only by
        passing through a submission) or not.

        """
        cls = type(obj)

        data = {"_class": cls.__name__}

        for prp in cls._col_props:
            col, = prp.columns
            col_type = type(col.type)

            val = getattr(obj, prp.key)
            if col_type in [Boolean, Integer, Float,
                            Unicode, RepeatedUnicode]:
                data[prp.key] = val
            elif col_type is String:
                data[prp.key] = \
                    val.decode('latin1') if val is not None else None
            elif col_type is DateTime:
                data[prp.key] = \
                    make_timestamp(val) if val is not None else None
            elif col_type is Interval:
                data[prp.key] = \
                    val.total_seconds() if val is not None else None
            else:
                raise RuntimeError("Unknown SQLAlchemy column type: %s"
                                   % col_type)

        for prp in cls._rel_props:
            other_cls = prp.mapper.class_

            # Skip submissions if requested
            if self.skip_submissions and other_cls is Submission:
                continue

            # Skip user_tests if requested
            if self.skip_user_tests and other_cls is UserTest:
                continue

            # Skip generated data if requested
            if self.skip_generated and other_cls in (SubmissionResult,
                                                     UserTestResult):
                continue

            val = getattr(obj, prp.key)
            if val is None:
                data[prp.key] = None
            elif isinstance(val, other_cls):
                data[prp.key] = self.get_id(val)
            elif isinstance(val, list):
                data[prp.key] = [self.get_id(i) for i in val]
            elif isinstance(val, dict):
                data[prp.key] = \
                    dict((k, self.get_id(v)) for k, v in val.iteritems())
            else:
                raise RuntimeError("Unknown SQLAlchemy relationship type: %s"
                                   % type(val))

        return data

    def safe_get_file(self, digest, path, descr_path=None):
        """Get file from FileCacher ensuring that the digest is
        correct.

        digest (string): the digest of the file to retrieve.
        path (string): the path where to save the file.
        descr_path (string): the path where to save the description.

        return (bool): True if all ok, False if something wrong.

        """
        # TODO - Probably this method could be merged in FileCacher

        # First get the file
        try:
            self.file_cacher.get_file_to_path(digest, path)
        except Exception as error:
            logger.error("File %s could not retrieved from file server (%r).",
                         digest, error)
            return False

        # Then check the digest
        calc_digest = sha1sum(path)
        if digest != calc_digest:
            logger.critical("File %s has wrong hash %s.",
                            digest, calc_digest)
            return False

        # If applicable, retrieve also the description
        if descr_path is not None:
            with io.open(descr_path, 'wt', encoding='utf-8') as fout:
                fout.write(self.file_cacher.describe(digest))

        return True