def __init__(self, config, tracker=None, relative=True):
    """Build a validator bound to one school configuration.

    Args:
        config (dict): School config dictionary. It is wrapped in a
            DotDict, tagged with kind 'config' and validated right away.
        tracker (optional): Tracker to report to. When None, a new Tracker
            is created, put in 'validating' mode and started.
        relative (bool, optional): Enforce relative ordering in validation.
    """
    def _skip(*_, **__):
        # Fallback for kinds that have no validate_<kind> method.
        return None

    Validator.load_schemas()

    dispatch = {}
    for kind in Validator.KINDS:
        method_name = 'validate_' + kind
        if hasattr(self, method_name):
            dispatch[kind] = getattr(self, method_name)
        else:
            dispatch[kind] = _skip
    self.kind_to_validation_function = dispatch

    # Running monitor of validated course and section codes.
    self.seen = {}

    self.config = DotDict(config)
    self.config['kind'] = 'config'
    self.validate(self.config)

    self.course_code_regex = re.compile(self.config.course_code_regex)
    self.relative = relative

    if tracker is not None:
        self.tracker = tracker
        return

    # Used during self-contained validation.
    self.tracker = Tracker()
    standalone = self.tracker
    standalone.school = self.config.school.code
    standalone.mode = 'validating'
    standalone.start()
def handle(self, *args, **options):
    """Run the digest step for every requested data type and school.

    Args:
        *args: Args of command.
        **options: Command options; 'master_log', 'types' and 'schools'
            are read here and the whole mapping is passed on to ``run``.
    """
    progress = Tracker()
    progress.add_viewer(LogFormatted(options['master_log']))
    progress.cmd_options = options
    progress.mode = 'digesting'
    progress.start()

    # Lazily flattened form of "for each type, for each school".
    jobs = (
        (data_type, school)
        for data_type in options['types']
        for school in options['schools']
    )
    for data_type, school in jobs:
        self.run(progress, school, data_type, options)

    progress.end()
def handle(self, *args, **options):
    """Digest each requested data type for each requested school.

    A StatView is attached to the tracker and kept on ``self.stat_view``.

    Args:
        *args: Args of command.
        **options: Command options; 'types' and 'schools' are read here.
    """
    progress = Tracker()
    self.stat_view = StatView()
    progress.add_viewer(self.stat_view)
    progress.mode = 'digesting'
    progress.start()

    # Lazily flattened form of "for each type, for each school".
    jobs = (
        (data_type, school)
        for data_type in options['types']
        for school in options['schools']
    )
    for data_type, school in jobs:
        self.run(progress, school, data_type, options)

    progress.end()
def handle(self, *args, **options):
    """Validate each requested parser type for each requested school.

    Args:
        *args: Args of command.
        **options: Command options; 'display_progress_bar', 'types' and
            'schools' are read here.
    """
    progress = Tracker()
    progress.mode = 'validating'

    show_bar = options['display_progress_bar']
    if show_bar:
        progress.add_viewer(StatProgressBar('{valid}/{total}'))

    progress.start()

    # Lazily flattened form of "for each type, for each school".
    jobs = (
        (parser_type, school)
        for parser_type in options['types']
        for school in options['schools']
    )
    for parser_type, school in jobs:
        self.run(options, school, parser_type, progress)
def handle(self, *args, **options):
    """Ingest each requested parser type for each requested school.

    The parser for a (school, parser type) pair is looked up in
    SCHOOLS_MAP and handed to ``run`` together with the shared tracker.

    Args:
        *args: Args of command.
        **options: Command options; 'display_progress_bar', 'types' and
            'schools' are read here.
    """
    progress = Tracker()
    progress.mode = 'ingesting'

    self.stat_view = StatView()
    progress.add_viewer(self.stat_view)

    if options['display_progress_bar']:
        bar = StatProgressBar('{valid}/{total}', statistics=self.stat_view)
        progress.add_viewer(bar)

    progress.start()

    # Lazily flattened form of "for each type, for each school".
    jobs = (
        (parser_type, school)
        for parser_type in options['types']
        for school in options['schools']
    )
    for parser_type, school in jobs:
        progress.school = school
        parser = SCHOOLS_MAP[school].parsers[parser_type]
        self.run(parser, progress, options, parser_type, school)

    progress.end()
class Validator:
    """Validation engine in parsing data pipeline.

    Attributes:
        config (:obj:`DotDict`): Loaded config.json.
        course_code_regex (:obj:`re`): Regex to match course code.
        kind_to_validation_function (:obj:`dict`): Map kind to validation
            function defined within this class.
        KINDS (:obj:`set`): Kinds of objects that validator validates.
        relative (:obj:`bool`): Enforce relative ordering in validation.
        seen (:obj:`dict`): Running monitor of seen courses and sections;
            maps a course code to a set of (section code, year, term).
        tracker (:obj:`parsing.library.tracker.Tracker`)
        transaction: Namespace with ``key`` and ``values`` collected by the
            ``validate`` call currently in progress.
    """

    KINDS = {
        'config',
        'datalist',
        'course',
        'section',
        'meeting',
        'directory',
        'eval',
        'instructor',
        'final_exam',
        'textbook',
        'textbook_link',
    }

    def __init__(self, config, tracker=None, relative=True):
        """Construct validator instance.

        Args:
            config (dict): School config dictionary.
            tracker (None, optional): Tracker to report to. When None, a
                new Tracker is created, set to 'validating' and started.
            relative (bool, optional): Enforce relative ordering in
                validation.
        """
        Validator.load_schemas()

        # Kinds without a dedicated validate_<kind> method get a no-op.
        self.kind_to_validation_function = {
            kind: getattr(self, 'validate_' + kind)
            if hasattr(self, 'validate_' + kind) else lambda *_, **__: None
            for kind in Validator.KINDS
        }

        # Running monitor of validated course and section codes.
        self.seen = {}

        self.config = DotDict(config)
        self.config['kind'] = 'config'
        self.validate(self.config)

        self.course_code_regex = re.compile(self.config.course_code_regex)
        self.relative = relative

        if tracker is None:
            # Used during self-contained validation.
            self.tracker = Tracker()
            self.tracker.school = self.config.school.code
            self.tracker.mode = 'validating'
            self.tracker.start()
        else:
            self.tracker = tracker

    @classmethod
    def load_schemas(cls, schema_path=None):
        """Load JSON validation schemas.

        NOTE: Will load schemas as static variable (i.e. once per
              definition), unless schema_path is specifically defined.

        Args:
            schema_path (None, str, optional): Override default schema_path
        """
        if hasattr(cls, 'SCHEMAS') and schema_path is None:
            return

        if schema_path is None:
            schema_path = '{}/{}/library/schemas'.format(
                settings.BASE_DIR, settings.PARSING_MODULE)

        def load(kind):
            """Return (schema, resolver) for the schema file of `kind`."""
            filepath = '{}/{}.json'.format(schema_path, kind)
            with open(filepath, 'r') as schema_file:
                schema = json.load(schema_file)
            resolved = jsonschema.RefResolver(
                'file://{}/'.format(schema_path), schema)
            return (schema, resolved)

        cls.SCHEMAS = DotDict({kind: load(kind) for kind in cls.KINDS})
        # TODO - make into a namedtuple instead

    @staticmethod
    def schema_validate(data, schema, resolver=None):
        """Validate data object with JSON schema alone.

        Args:
            data (dict): Data object to validate.
            schema: JSON schema to validate against.
            resolver (None, optional): JSON Schema reference resolution.

        Raises:
            ValidationError: Invalid object; wraps the data and the
                arguments of the underlying jsonschema error.
        """
        try:
            jsonschema.Draft4Validator(schema,
                                       resolver=resolver).validate(data)
        except jsonschema.exceptions.ValidationError as e:
            raise ValidationError(data, *e.args)
        # TODO - Create iter_errors from jsonschema validator
        # NOTE: if modifying schemas it may be prudent to catch:
        #       jsonschema.exceptions.SchemaError
        #       jsonschema.exceptions.RefResolutionError

    @staticmethod
    def file_to_json(path, allow_duplicates=False):
        """Load file pointed to by path into json object dictionary.

        Args:
            path (str): Path of the JSON file to read.
            allow_duplicates (bool, optional): Allow duplicate keys in JSON.

        Returns:
            dict: JSON-compliant dictionary.

        Raises:
            ValidationError: A key is duplicated within one JSON object and
                allow_duplicates is False.
        """
        def raise_on_duplicates(ordered_pairs):
            """Reject duplicate keys in dictionary."""
            d = {}
            for k, v in ordered_pairs:
                if k in d:
                    raise ValidationError("duplicate key: %r" % (k, ))
                d[k] = v
            return d

        with open(path, 'r') as f:
            if allow_duplicates:
                return json.load(f)
            return json.load(f, object_pairs_hook=raise_on_duplicates)

    def validate(self, data, transact=True):
        """Validation entry/dispatcher.

        Args:
            data (list, dict): Data to validate.
            transact (bool, optional): Start a fresh transaction for this
                call and merge it into ``seen`` once validation succeeded.
                Nested calls pass False to share the parent's transaction.
        """
        if transact:
            self.transaction = SimpleNamespace(key=None, values=set())

        data = DotDict(data)
        Validator.schema_validate(data, *Validator.SCHEMAS[data.kind])
        self.kind_to_validation_function[data.kind](data)

        if transact and self.transaction.key:
            self.seen.setdefault(self.transaction.key,
                                 set()).update(self.transaction.values)

    def validate_self_contained(self,
                                data_path,
                                break_on_error=True,
                                break_on_warning=False,
                                output_error=None,
                                display_progress_bar=True,
                                master_log_path=None):
        """Validate JSON file as without ingestor.

        Args:
            data_path (str): Path to data file.
            break_on_error (bool, optional): Re-raise the first
                ValidationError instead of continuing.
            break_on_warning (bool, optional): Re-raise the first
                ValidationWarning instead of continuing.
            output_error (None, optional): Error output file path
                (not used by this method).
            display_progress_bar (bool, optional): Not used by this method.
            master_log_path (None, optional): Not used by this method.

        Raises:
            ValidationError: An object is invalid and break_on_error is set.
            ValidationWarning: An object raised a warning and
                break_on_warning is set.
        """
        data = Validator.file_to_json(data_path)['$data']
        # Validator.schema_validate(data, *Validator.SCHEMAS.datalist)

        for obj in map(DotDict, data):
            try:
                self.validate(obj)
                self.tracker.stats = dict(kind=obj.kind, status='valid')
            except ValidationError:
                logging.exception('Validation error')
                if break_on_error:
                    # Bare raise keeps the original traceback.
                    raise
            except ValidationWarning as e:
                logging.warning(e)
                if break_on_warning:
                    raise
            self.tracker.stats = dict(kind=obj.kind, status='total')

        # TODO - this should be handled by caller
        self.tracker.end()

    def validate_course(self, course):
        """Validate course.

        Args:
            course (DotDict): Course object to validate.

        Raises:
            MultipleDefinitionsWarning: Course has already been validated in
                same session.
            ValidationError: Invalid course.
        """
        if 'kind' in course and course.kind != 'course':
            raise ValidationError(course,
                                  'course object must be of kind course')

        if ('school' in course
                and course.school.code != self.config.school.code):
            raise ValidationError(course,
                                  'course schools does not match config')

        if self.course_code_regex.match(course.code) is None:
            raise ValidationError(
                course, "course code {} does not match r'{}'".format(
                    course.code, self.config.course_code_regex))

        if ('department' in course and 'code' in course.department
                and 'departments' in self.config):
            department_codes = {d.code for d in self.config.departments}
            if course.department.code not in department_codes:
                raise ValidationError(
                    course,
                    'department {} is not in config.json departments'.format(
                        course.department))

        if 'homepage' in course:
            self.validate_website(course.homepage)

        # NOTE: 'same_as' course codes are intentionally not enforced
        #       against the course code regex (that check is disabled).

        if self.relative:
            if course.code in self.seen:
                raise MultipleDefinitionsWarning(
                    course,
                    'multiple definitions of course {}'.format(course.code))
            self.transaction.key = course.code

        for section in course.get('sections', []):
            if ('course' in section
                    and section['course']['code'] != course.code):
                raise ValidationError(
                    course, 'nested {} does not match parent {}'.format(
                        section['course']['code'], course.code))

            # NOTE: mutating dictionary
            section['course'] = {'code': course.code}
            section['kind'] = 'section'
            self.validate(DotDict(section), transact=False)

    def validate_section(self, section):
        """Validate section object.

        Args:
            section (DotDict): Section object to validate.

        Raises:
            MultipleDefinitionsWarning: Section was already defined.
            ValidationError: Invalid section.
        """
        if 'course' not in section:
            raise ValidationError(section,
                                  'section doesnt define a parent course')

        if 'kind' in section and section.kind != 'section':
            raise ValidationError(section, 'section must be of kind section')

        if self.course_code_regex.match(section.course.code) is None:
            raise ValidationError(
                section, 'course code {} does not match r\'{}\''.format(
                    section.course.code, self.config.course_code_regex))

        if 'term' in section and section.term not in self.config.terms:
            raise ValidationError(
                section,
                'term {} not in config.json term list'.format(section.term))

        instructors = section.get('instructors', [])
        if instructors:
            db_instructor_textfield_max_size = 500
            name_parts = []
            for instructor in instructors:
                instructor = DotDict(instructor)
                if isinstance(instructor.name, basestring):
                    name_parts.append(instructor.name)
                elif isinstance(instructor.name, dict):
                    name_parts.append('{} {}'.format(
                        instructor.name.first, instructor.name.last))
            if len(''.join(name_parts)) > db_instructor_textfield_max_size:
                raise ValidationError(
                    section,
                    'db field too small for comma-joined instructor names')

            for instructor in instructors:
                self.validate_instructor(instructor)

        if 'final_exam' in section:
            final_exam = section.final_exam
            if ('course' in final_exam
                    and final_exam.course.code != section.course.code):
                raise ValidationError(
                    section,
                    'final exam course {} doesnt match course code {}'.format(
                        final_exam.course.code, section.course.code))
            if ('section' in final_exam
                    and final_exam.section.code != section.code):
                raise ValidationError(
                    section,
                    'final exam section {} doesnt match section {}'.format(
                        final_exam.section.code, section.code))
            # final_exam['course'] = section.course
            # final_exam['section'] = {'code': section.code}
            # self.validate_final_exam(section.final_exam)

        if self.relative:
            section_id = (section.code, section.year, section.term)
            known_sections = (self.seen.get(section.course.code, set())
                              | self.transaction.values)
            if (section.course.code not in self.seen
                    and self.transaction.key != section.course.code):
                raise ValidationError(
                    section,
                    'course code {} isnt defined'.format(section.course.code))
            elif section_id in known_sections:
                raise MultipleDefinitionsWarning(
                    section,
                    'multiple defs for {} {} - {} already defined'.format(
                        section.course.code, section.code, section.year))
            self.transaction.key = section.course.code
            self.transaction.values.add(section_id)

        for meeting in section.get('meetings', []):
            meeting = DotDict(meeting)
            if ('course' in meeting
                    and meeting.course.code != section.course.code):
                raise ValidationError(
                    section,
                    'course code {} in meeting doesnt match parent section '
                    'course code {}'.format(meeting.course.code,
                                            section.course.code))
            if 'section' in meeting and meeting.section.code != section.code:
                raise ValidationError(
                    section,
                    'section code {} in nested meeting doesnt match parent '
                    'section code {}'.format(meeting.section.code,
                                             section.code))

            # NOTE: mutating obj
            meeting['course'] = section.course
            meeting['section'] = {
                'code': section.code,
                'year': section.year,
                'term': section.term
            }
            meeting['kind'] = 'meeting'
            self.validate(DotDict(meeting), transact=False)

        if 'textbooks' in section:
            for textbook in section.textbooks:
                self.validate_textbook_link(textbook)

    def validate_meeting(self, meeting):
        """Validate meeting object.

        Args:
            meeting (DotDict): Meeting object to validate.

        Raises:
            ValidationError: Invalid meeting.
            ValidationWarning: Re-raised from time range validation.
        """
        if 'kind' in meeting and meeting.kind != 'meeting':
            raise ValidationError(meeting,
                                  'meeting object must be of kind meeting')

        if ('course' in meeting and
                self.course_code_regex.match(meeting.course.code) is None):
            raise ValidationError(
                meeting, 'course code {} does not match regex \'{}\''.format(
                    meeting.course.code, self.config.course_code_regex))

        if 'time' in meeting:
            try:
                self.validate_time_range(meeting.time.start, meeting.time.end)
            except (ValidationError, ValidationWarning) as e:
                message = 'meeting for {} {}, '.format(meeting.course.code,
                                                       meeting.section.code)
                if isinstance(e, ValidationError):
                    raise ValidationError(message, *e.args)
                raise ValidationWarning(message, *e.args)

        if 'location' in meeting:
            # NOTE(review): validate_location raises ValidationWarning;
            #               confirm this handler is meant to catch it.
            try:
                self.validate_location(meeting.location)
            except ValidationError as e:
                message = 'meeting for {} {}, '.format(meeting.course.code,
                                                       meeting.section.code)
                raise ValidationError(message, *e.args)

        if not self.relative:
            return

        # NOTE(review): validate() always assigns a namespace to
        #               self.transaction, so the `is None` test looks
        #               unreachable; kept to preserve behavior.
        if ('course' in meeting and meeting.course.code not in self.seen
                and self.transaction is None):
            raise ValidationError(
                meeting,
                'course code {} isnt defined'.format(meeting.course.code))

        if 'section' not in meeting:
            return

        section_id = (meeting.section.code, meeting.section.year,
                      meeting.section.term)
        known_sections = (self.seen.get(meeting.course.code, set())
                          | self.transaction.values)
        if section_id not in known_sections:
            raise ValidationError(
                meeting,
                'section {} isnt defined'.format(meeting.section.code))

    def validate_eval(self, course_eval):
        """Validate evaluation object.

        Args:
            course_eval (DotDict): Evaluation to validate.

        Raises:
            ValidationError: Invalid evaluation.
        """
        if self.course_code_regex.match(course_eval.course.code) is None:
            raise ValidationError(
                course_eval, "course code {} does not match r'{}'".format(
                    course_eval.course.code, self.config.course_code_regex))

    def validate_instructor(self, instructor):
        """Validate instructor object.

        Args:
            instructor (DotDict): Instructor object to validate.

        Raises:
            ValidationError: Invalid instructor.
        """
        if 'kind' in instructor and instructor.kind != 'instructor':
            raise ValidationError(
                instructor, 'instructor object must be of kind instructor')

        for class_ in instructor.get('classes', []):
            if ('course' in class_ and
                    self.course_code_regex.match(class_.course.code) is None):
                raise ValidationError(
                    instructor,
                    'course code {} does not match given regex {}'.format(
                        class_.course.code, self.config.course_code_regex))

        if 'department' in instructor and 'departments' in self.config:
            dept_codes = {d.code for d in self.config.departments}
            if instructor.department not in dept_codes:
                raise ValidationError(
                    instructor,
                    'department {} not listed in config.json'.format(
                        instructor.department))

        if 'homepage' in instructor:
            try:
                # Same URL check that validate_course applies to homepages.
                self.validate_website(instructor.homepage)
            except ValidationError as e:
                message = 'instructor {} homepage, '.format(instructor.name)
                raise ValidationError(message, *e.args)

        if 'office' in instructor:
            try:
                if 'location' in instructor.office:
                    self.validate_location(instructor.office.location)
                for office_hour in instructor.office.get('hours', []):
                    self.validate_meeting(office_hour)
            except ValidationError as e:
                message = 'instructor {} office, '.format(instructor.name)
                raise ValidationError(message, *e.args)

    def validate_final_exam(self, final_exam):
        """Validate final exam.

        NOTE: currently unused.

        Args:
            final_exam (DotDict): Final Exam object to validate.

        Raises:
            ValidationError: Invalid final exam.
        """
        if 'kind' in final_exam and final_exam.kind != 'final_exam':
            raise ValidationError(
                final_exam, 'final_exam object must be of kind "final_exam"')
        try:
            self.validate_meeting(final_exam.meeting)
        except ValidationError as e:
            raise ValidationError(final_exam, *e.args)

    def validate_textbook_link(self, textbook_link):
        """Validate textbook link.

        Args:
            textbook_link (DotDict): Textbook link object to validate.

        Raises:
            ValidationError: Invalid textbook link.
        """
        if 'course' not in textbook_link:
            return
        if self.course_code_regex.match(
                textbook_link.course.code) is not None:
            return
        raise ValidationError(
            textbook_link,
            'textbook_link course code doent match course code regex')

    def validate_location(self, location):
        """Validate location.

        Args:
            location (DotDict): Location object to validate.

        Raises:
            ValidationWarning: Invalid location.
        """
        if 'campus' in location and 'campuses' in self.config:
            if location.campus not in self.config.campuses:
                raise ValidationWarning(
                    location,
                    'campus {} not in config'.format(location.campus),
                )

        if 'building' in location and 'buildings' in self.config:
            if location.building not in self.config.buildings:
                raise ValidationWarning(
                    location,
                    'building {} not in config'.format(location.building),
                )

    @staticmethod
    def validate_website(url):
        """Validate url by sending HEAD request and analyzing response.

        Args:
            url (str): URL to validate.

        Raises:
            ValidationError: URL is invalid.
        """
        # NOTE(review): `url` is handed to HTTPConnection as-is; confirm
        #               callers supply a value that constructor accepts.
        c = httplib.HTTPConnection(url)
        try:
            c.request('HEAD', '')
            # Read the response once; asking the connection for a second
            # response to the same request fails.
            status = c.getresponse().status
        finally:
            c.close()
        # NOTE: 200 - good status
        #       301 - redirected
        if status in (200, 301):
            return
        raise ValidationError(url, 'invalid website w/url "{}"'.format(url))

    def validate_time_range(self, start, end):
        """Validate start time and end time.

        There exists an unhandled case if the end time is midnight.

        Args:
            start (str): Start time.
            end (str): End time.

        Raises:
            ValidationError: Time range is invalid.
        """
        try:
            start, end = map(dparser.parse, [start, end])
        except ValueError:
            raise ValidationError('invalid time format {}-{}'.format(
                start, end))

        if start > end:
            raise ValidationError('start {} > end {}'.format(start, end))
        elif start == end:
            pass
            # TODO - this should be reported
            # raise ValidationWarning('start {} = end {}'.format(start, end))
        # NOTE: there exists an unhandled case if the end time is midnight.

    def validate_directory(self, directory):
        """Validate directory.

        Args:
            directory (str, dict): Directory to validate.
                May be either path or object.

        Raises:
            ValidationError: encapsulated IOError
        """
        if isinstance(directory, str):
            try:
                name = directory
                directory = dir_to_dict(directory)
                directory['name'] = name
            except IOError as e:
                raise ValidationError(str(e))
        Validator.schema_validate(directory, *Validator.SCHEMAS.directory)
def handle(self, *args, **options):
    """Logic of the command.

    For every requested parser type and school, the module
    ``parsing.schools.<school>.<parser_type>`` is imported and its
    ``Parser`` attribute is handed to ``run``. An ImportError (from the
    import or from ``run``) is logged and the next pair is processed.

    Args:
        *args: Args of command.
        **options: Command options; 'display_progress_bar', 'types' and
            'schools' are read here.
    """
    # Imported locally so the block does not rely on a file-level import.
    import importlib

    tracker = Tracker()
    tracker.mode = "ingesting"
    self.stat_view = StatView()
    tracker.add_viewer(self.stat_view)
    if options["display_progress_bar"]:
        tracker.add_viewer(
            StatProgressBar("{valid}/{total}", statistics=self.stat_view)
        )
    tracker.start()

    for parser_type in options["types"]:
        for school in options["schools"]:
            tracker.school = school
            module_path = "parsing.schools.{school}.{parser_type}".format(
                school=school, parser_type=parser_type
            )
            try:
                # Resolve the class through the import system instead of
                # eval(), so option values are never executed as code.
                parser = importlib.import_module(module_path).Parser
                self.run(parser, tracker, options, parser_type, school)
            except ImportError:
                logging.exception("Invalid parser")
    tracker.end()
def handle(self, *args, **options):
    """Ingest each requested data type for each requested school.

    Schools that have no entry in ``parsers`` for the data type are routed
    to ``old_parser`` with the matching legacy parser map instead.

    Args:
        *args: Args of command.
        **options: Command options; 'master_log', 'display_progress_bar',
            'types' and 'schools' are read here.
    """
    progress = Tracker()
    progress.cmd_options = options
    progress.add_viewer(LogFormatted(options['master_log']))
    progress.mode = 'ingesting'

    show_bar = options['display_progress_bar']
    if show_bar:
        progress.add_viewer(ProgressBar('{valid}/{total}'))

    progress.start()

    for data_type in options['types']:
        for school in options['schools']:
            progress.school = school

            if school in parsers[data_type]:
                self.run(parsers[data_type][school], progress, options,
                         data_type, school)
            else:
                # TODO - remove after deprecation
                legacy_parsers = {
                    'textbooks': textbook_parsers,
                    'courses': course_parsers,
                    'evals': eval_parsers,
                }
                self.old_parser(legacy_parsers[data_type][school], school)
                # END - remove after deprecation

    progress.end()
def handle(self, *args, **options):
    """Run ingestion for all requested parser types and schools.

    Each parser is taken from ``SCHOOLS_MAP[school].parsers`` and passed
    to ``run`` along with the tracker created here.

    Args:
        *args: Args of command.
        **options: Command options; 'display_progress_bar', 'types' and
            'schools' are read here.
    """
    ingest_tracker = Tracker()
    ingest_tracker.mode = 'ingesting'

    self.stat_view = StatView()
    ingest_tracker.add_viewer(self.stat_view)

    wants_bar = options['display_progress_bar']
    if wants_bar:
        ingest_tracker.add_viewer(
            StatProgressBar('{valid}/{total}', statistics=self.stat_view))

    ingest_tracker.start()

    for kind in options['types']:
        for school_code in options['schools']:
            ingest_tracker.school = school_code
            school_parsers = SCHOOLS_MAP[school_code].parsers
            self.run(school_parsers[kind], ingest_tracker, options, kind,
                     school_code)

    ingest_tracker.end()
class Validator: """Validation engine in parsing data pipeline. Attributes: config (:obj:`DotDict`): Loaded config.json. course_code_regex (:obj:`re`): Regex to match course code. kind_to_validation_function (:obj:`dict`): Map kind to validation function defined within this class. KINDS (:obj:`set`): Kinds of objects that validator validates. relative (:obj:`bool`): Enforce relative ordering in validation. seen (:obj:`dict`): Running monitor of seen courses and sections tracker (:obj:`parsing.library.tracker.Tracker`) """ KINDS = { 'config', 'datalist', 'course', 'section', 'meeting', 'directory', 'eval', 'instructor', 'final_exam', 'textbook', 'textbook_link', } def __init__(self, config, tracker=None, relative=True): """Construct validator instance. Args: config (dict): School config dictionary. tracker (None, optional): Description relative (bool, optional): Enforce relative ordering in validation. """ Validator.load_schemas() self.kind_to_validation_function = { kind: getattr(self, 'validate_' + kind) if hasattr(self, 'validate_' + kind) else lambda *_, **__: None for kind in Validator.KINDS } # Running monitor of validated course and section codes. self.seen = {} self.config = DotDict(config) self.config['kind'] = 'config' self.validate(self.config) self.course_code_regex = re.compile(self.config.course_code_regex) self.relative = relative if tracker is None: # Used during self-contained validation. self.tracker = Tracker() self.tracker.school = self.config.school.code self.tracker.mode = 'validating' self.tracker.start() else: self.tracker = tracker @classmethod def load_schemas(cls, schema_path=None): """Load JSON validation schemas. NOTE: Will load schemas as static variable (i.e. once per definition), unless schema_path is specifically defined. 
Args: schema_path (None, str, optional): Override default schema_path """ if hasattr(cls, 'SCHEMAS') and schema_path is None: return if schema_path is None: schema_path = '{}/{}/library/schemas'.format( settings.BASE_DIR, settings.PARSING_MODULE ) def load(kind): filepath = '{}/{}.json'.format(schema_path, kind) with open(filepath, 'r') as file: schema = json.load(file) resolved = jsonschema.RefResolver( 'file://{}/'.format(schema_path), schema ) return (schema, resolved) cls.SCHEMAS = DotDict({ kind: load(kind) for kind in cls.KINDS }) # TODO - make into a namedtuple instead @staticmethod def schema_validate(data, schema, resolver=None): """Validate data object with JSON schema alone. Args: data (dict): Data object to validate. schema: JSON schema to validate against. resolver (None, optional): JSON Schema reference resolution. Raises: jsonschema.exceptions.ValidationError: Invalid object. """ try: jsonschema.Draft4Validator(schema, resolver=resolver).validate(data) except jsonschema.exceptions.ValidationError as e: raise ValidationError(data, *e.args) # TODO - Create iter_errors from jsonschema validator # NOTE: if modifying schemas it may be prudent to catch: # jsonschema.exceptions.SchemaError # jsonschema.exceptions.RefResolutionError @staticmethod def file_to_json(path, allow_duplicates=False): """Load file pointed to by path into json object dictionary. Args: path (str): allow_duplicates (bool, optional): Allow duplicate keys in JSON. Returns: dict: JSON-compliant dictionary. """ def raise_on_duplicates(ordered_pairs): """Reject duplicate keys in dictionary.""" d = {} for k, v in ordered_pairs: if k in d: raise ValidationError("duplicate key: %r" % (k,)) d[k] = v return d with open(path, 'r') as f: if allow_duplicates: return json.load(f) return json.load(f, object_pairs_hook=raise_on_duplicates) def validate(self, data, transact=True): """Validation entry/dispatcher. Args: data (list, dict): Data to validate. 
""" if transact: self.transaction = SimpleNamespace(key=None, values=set()) data = DotDict(data) Validator.schema_validate(data, *Validator.SCHEMAS[data.kind]) self.kind_to_validation_function[data.kind](data) if transact and self.transaction.key: self.seen.setdefault(self.transaction.key, set()).update(self.transaction.values) def validate_self_contained(self, data_path, break_on_error=True, break_on_warning=False, output_error=None, display_progress_bar=True, master_log_path=None): """Validate JSON file as without ingestor. Args: data_path (str): Path to data file. break_on_error (bool, optional): Description break_on_warning (bool, optional): Description output_error (None, optional): Error output file path. display_progress_bar (bool, optional): Description master_log_path (None, optional): Description break_on_error (bool, optional) break_on_warning (bool, optional) display_progress_bar (bool, optional) Raises: ValidationError: Description """ data = Validator.file_to_json(data_path)['$data'] # Validator.schema_validate(data, *Validator.SCHEMAS.datalist) for obj in map(DotDict, data): try: self.validate(obj) self.tracker.stats = dict(kind=obj.kind, status='valid') except ValidationError as e: logging.exception('Validation error') if break_on_error: raise ValidationError(*e.args) except ValidationWarning as e: logging.warn(e) # warnings.warn('', e, stacklevel=2) self.tracker.stats = dict(kind=obj.kind, status='total') # TODO - this should be handled by caller self.tracker.end() def validate_course(self, course): """Validate course. Args: course (DotDict): Course object to validate. Raises: MultipleDefinitionsWarning: Course has already been validated in same session. ValidationError: Invalid course. 
""" if 'kind' in course and course.kind != 'course': raise ValidationError(course, 'course object must be of kind course') if ('school' in course and course.school.code != self.config.school.code): raise ValidationError(course, 'course schools does not match config') if self.course_code_regex.match(course.code) is None: raise ValidationError( course, "course code {} does not match r'{}'".format( course.code, self.config.course_code_regex ) ) if ('department' in course and 'code' in course.department and 'departments' in self.config): department_codes = {d.code for d in self.config.departments} if course.department.code not in department_codes: raise ValidationError( course, 'department {} is not in config.json departments'.format( course.department) ) if 'homepage' in course: self.validate_website(course.homepage) for sa in course.get('same_as', []): if self.course_code_regex.match(sa) is not None: continue # raise ValidationError( # course, # "same as course code {} does not match r'{}'".format( # course.code, # self.config.course_code_regex # ) # ) if self.relative: if course.code in self.seen: raise MultipleDefinitionsWarning( course, 'multiple definitions of course {}'.format(course.code) ) self.transaction.key = course.code for section in course.get('sections', []): if ('course' in section and section['course']['code'] != course.code): raise ValidationError( course, 'nested {} does not match parent {}'.format( section['course']['code'], course.code ) ) # NOTE: mutating dictionary section['course'] = {'code': course.code} section['kind'] = 'section' self.validate(DotDict(section), transact=False) def validate_section(self, section): """Validate section object. Args: section (DotDict): Section object to validate. Raises: MultipleDefinitionsWarning: Invalid section. 
ValidationError: Description """ if 'course' not in section: raise ValidationError(section, 'section doesnt define a parent course') if 'kind' in section and section.kind != 'section': raise ValidationError(section, 'section must be of kind section') if ('course' in section and self.course_code_regex.match(section.course.code) is None): raise ValidationError( section, 'course code {} does not match r\'{}\''.format( section.course.code, self.config.course_code_regex ) ) if 'term' in section and section.term not in self.config.terms: raise ValidationError( section, 'term {} not in config.json term list'.format(section.term) ) if 'instructors' in section: db_instructor_textfield_max_size = 500 instructor_textfield = '' for instructor in section.get('instructors', []): instructor = DotDict(instructor) if isinstance(instructor.name, basestring): instructor_textfield += instructor.name elif isinstance(instructor.name, dict): instructor_textfield += '{} {}'.format(instructor.name.first, instructor.name.last) db_instructor_textfield_size = len(instructor_textfield) if db_instructor_textfield_size > db_instructor_textfield_max_size: raise ValidationError( section, 'db field too small for comma-joined instructor names' ) for instructor in section.get('instructors', []): self.validate_instructor(instructor) if 'final_exam' in section: if ('course' in section.final_exam and section.final_exam.course.code != section.course.code): raise ValidationError( section, 'final exam course {} doesnt match course code {}'.format( section.final_exam.course.code, section.course.code ) ) if ('section' in section.final_exam and section.final_exam.section.code != section.code): raise ValidationError( section, 'final exam section {} doesnt match section {}'.format( section.final_exam.section.code, section.code ) ) # final_exam['course'] = section.course # final_exam['section'] = {'code': section.code} # self.validate_final_exam(section.final_exam) if self.relative: if section.course.code not in 
self.seen and self.transaction.key != section.course.code: print(self.seen) raise ValidationError( 'course code {} isnt defined'.format(section.course.code), section ) elif ((section.code, section.year, section.term) in self.seen.get(section.course.code, set()) | self.transaction.values): raise MultipleDefinitionsWarning( section, 'multiple defs for {} {} - {} already defined'.format( section.course.code, section.code, section.year ) ) self.transaction.key = section.course.code self.transaction.values.add((section.code, section.year, section.term)) for meeting in section.get('meetings', []): meeting = DotDict(meeting) if ('course' in meeting and meeting.course.code != section.course.code): raise ValidationError( section, 'course code {} in meeting doesnt match parent section \ course code {}'.format( meeting.course.code, section.course.code ) ) if 'section' in meeting and meeting.section.code != section.code: raise ValidationError( section, 'section code {} in nested meeting doesnt match parent \ section code {}'.format( meeting.section.code, section.code ) ) # NOTE: mutating obj meeting['course'] = section.course meeting['section'] = { 'code': section.code, 'year': section.year, 'term': section.term } meeting['kind'] = 'meeting' self.validate(DotDict(meeting), transact=False) if 'textbooks' in section: for textbook in section.textbooks: self.validate_textbook_link(textbook) def validate_meeting(self, meeting): """Validate meeting object. Args: meeting (DotDict): Meeting object to validate. Raises: ValidationError: Invalid meeting. 
ValidationWarning: Description """ if 'kind' in meeting and meeting.kind != 'meeting': raise ValidationError(meeting, 'meeting object must be kind instructor') if ('course' in meeting and self.course_code_regex.match(meeting.course.code) is None): raise ValidationError( meeting, 'course code {} does not match regex \'{}\''.format( meeting.course.code, self.config.course_code_regex ) ) if 'time' in meeting: try: self.validate_time_range(meeting.time.start, meeting.time.end) except (ValidationError, ValidationWarning) as e: message = 'meeting for {} {}, '.format( meeting.course.code, meeting.section.code ) if isinstance(e, ValidationError): raise ValidationError(message, *e.args) raise ValidationWarning(message, *e.args) if 'location' in meeting: try: self.validate_location(meeting.location) except ValidationError as e: message = 'meeting for {} {}, '.format( meeting.course.code, meeting.section.code ) raise ValidationError(message, *e.args) if not self.relative: return if 'course' in meeting and meeting.course.code not in self.seen and self.transaction is None: raise ValidationError( meeting, 'course code {} isnt defined'.format(meeting.course.code) ) if 'section' not in meeting: return if (meeting.section.code, meeting.section.year, meeting.section.term) not in self.seen.get(meeting.course.code, set()) | self.transaction.values: raise ValidationError( meeting, 'section {} isnt defined'.format(meeting.section.code) ) def validate_eval(self, course_eval): """Validate evaluation object. Args: course_eval (DotDict): Evaluation to validate. Raises: ValidationError: Invalid evaulation. """ if self.course_code_regex.match(course_eval.course.code) is None: raise ValidationError( course_eval, "course code {} does not match r'{}'".format( course_eval.course.code, self.config.course_code_regex ) ) def validate_instructor(self, instructor): """Validate instructor object. Args: instructor (DotDict): Instructor object to validate. Raises: ValidationError: Invalid instructor. 
""" if 'kind' in instructor and instructor.kind != 'instructor': raise ValidationError( instructor, 'instructor object must be of kind instructor' ) for class_ in instructor.get('classes', []): if ('course' in class_ and self.course_code_regex.match(class_.course.code) is None): raise ValidationError( instructor, 'course code {} does not match given regex {}'.format( class_.course.code, self.config.course_code_regex ) ) if 'department' in instructor and 'departments' in self.config: dept_codes = {d.code for d in self.config.departments} if instructor.department not in dept_codes: raise ValidationError( instructor, 'department {} not listed in config.json'.format( instructor.department ) ) if 'homepage' in instructor: try: self.validate_homepage(instructor.homepage) except ValidationError as e: message = 'instructor {} office, {}'.format(instructor.name) raise ValidationError(message, *e.args) if 'office' in instructor: try: if 'location' in instructor.office: self.validate_location(instructor.office.location) for office_hour in instructor.office.get('hours', []): self.validate_meeting(office_hour) except ValidationError as e: message = 'instructor {} office, {}'.format(instructor.name) raise ValidationError(message, *e.args) def validate_final_exam(self, final_exam): """Validate final exam. NOTE: currently unused. Args: final_exam (DotDict): Final Exam object to validate. Raises: ValidationError: Invalid final exam. """ if 'kind' in final_exam and final_exam.kind != 'final_exam': raise ValidationError( final_exam, 'final_exam object must be of kind "final_exam"' ) try: self.validate_meeting(final_exam.meeting) except ValidationError as e: raise ValidationError(final_exam, *e.args) def validate_textbook_link(self, textbook_link): """Validate textbook link. Args: textbook_link (DotDict): Textbook link object to validate. Raises: ValidationError: Invalid textbook link. 
""" if 'course' not in textbook_link: return if self.course_code_regex.match(textbook_link.course.code) is not None: return raise ValidationError( textbook_link, 'textbook_link course code doent match course code regex' ) def validate_location(self, location): """Validate location. Args: location (DotDict): Location object to validate. Raises: ValidationWarning: Invalid location. """ if 'campus' in location and 'campuses' in self.config: if location.campus not in self.config.campuses: raise ValidationWarning( location, 'campus {} not in config'.format(location.campus), ) if 'building' in location and 'buildings' in self.config: if location.building not in self.config.buildings: raise ValidationWarning( location, 'building {} not in config'.format(location.building), ) @staticmethod def validate_website(url): """Validate url by sending HEAD request and analyzing response. Args: url (str): URL to validate. Raises: ValidationError: URL is invalid. """ c = httplib.HTTPConnection(url) c.request('HEAD', '') # NOTE: 200 - good status # 301 - redirected if c.getresponse().status == 200 or c.getresponse().status == 301: return raise ValidationError(url, 'invalid website w/url "%s"'.format(url)) def validate_time_range(self, start, end): """Validate start time and end time. There exists an unhandled case if the end time is midnight. Args: start (str): Start time. end (str): End time. Raises: ValidationError: Time range is invalid. """ try: start, end = map(dparser.parse, [start, end]) except ValueError: raise ValidationError('invalid time format {}-{}'.format(start, end)) if start > end: raise ValidationError('start {} > end {}'.format(start, end)) elif start == end: pass # TODO - this should be reported # raise ValidationWarning('start {} = end {}'.format(start, end)) # NOTE: there exists an unhandled case if the end time is midnight. def validate_directory(self, directory): """Validate directory. Args: directory (str, dict): Directory to validate. 
May be either path or object. Raises: ValidationError: encapsulated IOError """ if isinstance(directory, str): try: name = directory directory = dir_to_dict(directory) directory['name'] = name except IOError as e: raise ValidationError(str(e)) Validator.schema_validate(directory, *Validator.SCHEMAS.directory)
class Validator:
    """Validation engine in parsing data pipeline.

    Objects are first checked against the JSON schema registered for their
    ``kind`` and then against the ``validate_<kind>`` method of this class,
    when one exists.

    Attributes:
        config (DotDict): Loaded config.json.
        course_code_regex (re): Regex to match course code.
        kind_to_validation_function (dict): Map kind to validation function
            defined within this class.
        KINDS (set): Kinds of objects that validator validates.
        relative (bool): Enforce relative ordering in validation.
        seen (dict): Running monitor of seen courses and sections, shaped
            ``{course_code: {section_code: {year: set(terms)}}}``.
        tracker (parsing.parsing_library.tracker.Tracker): Tracker.
    """

    KINDS = {
        'config',
        'datalist',
        'course',
        'section',
        'meeting',
        'directory',
        'eval',
        'instructor',
        'final_exam',
        'textbook',
        'textbook_link',
    }

    def __init__(self, config_path, tracker=None, relative=True):
        """Construct validator instance.

        Args:
            config_path (str): School config file path.
            tracker (None, optional): Tracker to report to. When None a new
                Tracker is created, put in 'validating' mode and started.
            relative (bool, optional): Enforce relative ordering in
                validation.
        """
        Validator.load_schemas()

        # Kinds without a dedicated validate_<kind> method are accepted
        # after schema validation alone.
        self.kind_to_validation_function = {
            kind: getattr(self, 'validate_' + kind, lambda *_, **__: None)
            for kind in Validator.KINDS
        }

        # Running monitor of validated course and section codes.
        # Defined before the first validate() call so that no validation
        # function can ever observe a half-constructed instance.
        self.seen = {}

        self.config = DotDict(Validator.file_to_json(config_path))
        self.config['kind'] = 'config'
        self.validate(self.config)

        self.course_code_regex = re.compile(self.config.course_code_regex)
        self.relative = relative

        if tracker is None:
            # Used during self-contained validation.
            self.tracker = Tracker()
            self.tracker.school = self.config.school.code
            self.tracker.mode = 'validating'
            self.tracker.start()
        else:
            self.tracker = tracker

    @classmethod
    def load_schemas(cls, schema_path=None):
        """Load JSON validation schemas.

        NOTE: Will load schemas as static variable (i.e. once per definition),
              unless schema_path is specifically defined.

        Args:
            schema_path (None, str, optional): Override default schema_path
        """
        if hasattr(cls, 'SCHEMAS') and schema_path is None:
            return

        if schema_path is None:
            schema_path = '{}/{}/library/schemas'.format(
                settings.BASE_DIR, settings.PARSING_DIR)

        def load(kind):
            """Return (schema, resolver) for the schema file of `kind`."""
            filepath = '{}/{}.json'.format(schema_path, kind)
            with open(filepath, 'r') as schema_file:
                schema = json.load(schema_file)
            resolved = jsonschema.RefResolver(
                'file://{}/'.format(schema_path), schema)
            return (schema, resolved)

        cls.SCHEMAS = DotDict({kind: load(kind) for kind in cls.KINDS})

    @staticmethod
    def schema_validate(data, schema, resolver=None):
        """Validate data object with JSON schema alone.

        Args:
            data (dict): Data object to validate
            schema (dict): JSON schema to validate against.
            resolver (None, optional): JSON Schema reference resolution.

        Raises:
            jsonschema.exceptions.ValidationError: Invalid object.
        """
        jsonschema.Draft4Validator(schema, resolver=resolver).validate(data)
        # TODO - Create iter_errors from jsonschema validator
        # NOTE: if modifying schemas it may be prudent to catch:
        #       jsonschema.exceptions.SchemaError
        #       jsonschema.exceptions.RefResolutionError

    @staticmethod
    def file_to_json(path, allow_duplicates=False):
        """Load file pointed to by path into json object dictionary.

        Args:
            path (str): Path to the JSON file.
            allow_duplicates (bool, optional): Allow duplicate keys in JSON.

        Returns:
            dict: JSON-compliant dictionary.

        Raises:
            JsonValidationError: Duplicate key found while
                allow_duplicates is False.
        """
        def raise_on_duplicates(ordered_pairs):
            """Reject duplicate keys in dictionary."""
            d = {}
            for k, v in ordered_pairs:
                if k in d:
                    raise JsonValidationError("duplicate key: %r" % (k, ))
                d[k] = v
            return d

        with open(path, 'r') as f:
            if allow_duplicates:
                return json.load(f)
            return json.load(f, object_pairs_hook=raise_on_duplicates)

    def validate(self, data):
        """Validation entry/dispatcher.

        Every object is schema-validated for its kind and then handed to
        the validation function registered for that kind.

        Args:
            data (list, dict): Data to validate.
        """
        for obj in make_list(data):
            obj = DotDict(obj)
            Validator.schema_validate(obj, *Validator.SCHEMAS[obj.kind])
            self.kind_to_validation_function[obj.kind](obj)

    def validate_self_contained(self,
                                data_path,
                                break_on_error=True,
                                break_on_warning=False,
                                output_error=None,
                                display_progress_bar=True,
                                master_log_path=None):
        """Validate JSON file without ingestor.

        Args:
            data_path (str): Path to data file.
            break_on_error (bool, optional): Re-raise the first
                JsonValidationError instead of continuing.
            break_on_warning (bool, optional): Re-raise the first
                JsonValidationWarning instead of continuing.
            output_error (None, optional): Error output file path.
            display_progress_bar (bool, optional): Attach a progress bar
                viewer to the tracker.
            master_log_path (None, optional): Accepted for interface
                compatibility; not used by this method.

        Raises:
            JsonValidationError: Data file is unusable, or an object is
                invalid and break_on_error is set.
            JsonValidationWarning: An object raised a warning and
                break_on_warning is set.
        """
        # TODO - iter errors and catch exceptions within method
        if display_progress_bar:
            self.tracker.add_viewer(ProgressBar('{total}'))
        logger = Logger(errorfile=output_error)

        try:
            # self.validate_directory(directory)
            data = Validator.file_to_json(data_path)['$data']
            Validator.schema_validate(data, *Validator.SCHEMAS.datalist)
        except (JsonValidationError, json.scanner.JSONDecodeError) as e:
            logger.log(e)
            raise  # fatal error, cannot continue

        for obj in data:
            obj = DotDict(obj)
            try:
                self.kind_to_validation_function[obj.kind](obj)
                self.tracker.status = dict(kind=obj.kind, status='valid')
            except JsonValidationError as e:
                logger.log(e)
                if break_on_error:
                    raise
            except JsonValidationWarning as e:
                logger.log(e)
                if break_on_warning:
                    raise
            # Counted whether or not the object was valid.
            self.tracker.status = dict(kind=obj.kind, status='total')

        self.tracker.end()

    def validate_course(self, course):
        """Validate course.

        In relative mode the course is recorded in `seen` before its nested
        sections are validated, since a nested section requires its parent
        course to be defined. It stays recorded even if a nested section
        then fails validation.

        Args:
            course (DotDict): Course object to validate.

        Raises:
            JsonDuplicationWarning: Course was already defined (raised after
                the nested sections have been validated).
            JsonValidationError: Invalid course.
        """
        if 'kind' in course and course.kind != 'course':
            raise JsonValidationError('course object must be of kind course',
                                      course)

        if self.course_code_regex.match(course.code) is None:
            raise JsonValidationError(
                "course code {} does not match r'{}'".format(
                    course.code, self.config.course_code_regex), course)

        if ('department' in course and 'code' in course.department
                and 'departments' in self.config):
            department_codes = {d.code for d in self.config.departments}
            if course.department.code not in department_codes:
                raise JsonValidationError(
                    'department {} is not in config.json departments'.format(
                        course.department.code), course)

        if 'homepage' in course:
            self.validate_website(course.homepage)

        redefined = False
        if self.relative:
            redefined = course.code in self.seen
            self.seen.setdefault(course.code, {})

        for section in course.get('sections', []):
            if 'course' in section and section.course.code != course.code:
                raise JsonValidationError(
                    'nested {} does not match parent {}'.format(
                        section.course.code, course.code), course)

            # NOTE: mutating dictionary
            section.course = {'code': course.code}
            self.validate_section(section)

        if redefined:
            raise JsonDuplicationWarning(
                'multiple definitions of course {}'.format(course.code),
                course)

    def validate_section(self, section):
        """Validate section object.

        In relative mode the section is recorded in `seen` before its nested
        meetings are validated, since a nested meeting requires its parent
        section to be defined.

        Args:
            section (DotDict): Section object to validate.

        Raises:
            JsonDuplicationWarning: Same section, year and term was already
                defined (raised after nested objects have been validated).
            JsonValidationError: Invalid section.
        """
        if 'course' not in section:
            raise JsonValidationError('section doesnt define a parent course',
                                      section)

        if 'kind' in section and section.kind != 'section':
            raise JsonValidationError('section must be of kind "section"',
                                      section)

        if self.course_code_regex.match(section.course.code) is None:
            raise JsonValidationError(
                'course code {} does not match r\'{}\''.format(
                    section.course.code, self.config.course_code_regex),
                section)

        if 'term' in section and section.term not in self.config.terms:
            raise JsonValidationError(
                'term {} not in config.json term list'.format(section.term),
                section)

        if 'instructors' in section:
            db_instructor_textfield_size = 500
            joined_names = ', '.join(
                instructor['name'] for instructor in section.instructors)
            if len(joined_names) > db_instructor_textfield_size:
                raise JsonValidationError(
                    'db field too small for comma-joined instructor names',
                    section)

        for instructor in section.get('instructors', []):
            self.validate_instructor(instructor)

        if 'final_exam' in section:
            if ('course' in section.final_exam and
                    section.final_exam.course.code != section.course.code):
                raise JsonValidationError(
                    'final exam course {} doesnt match course code {}'.format(
                        section.final_exam.course.code, section.course.code),
                    section)
            if ('section' in section.final_exam
                    and section.final_exam.section.code != section.code):
                raise JsonValidationError(
                    'final exam section {} doesnt match section {}'.format(
                        section.final_exam.section.code, section.code),
                    section)
            # final_exam['course'] = section.course
            # final_exam['section'] = {'code': section.code}
            # self.validate_final_exam(section.final_exam)

        redefined = False
        if self.relative:
            if section.course.code not in self.seen:
                raise JsonValidationError(
                    'course code {} isnt defined'.format(section.course.code),
                    section)
            # Terms are kept in a set so several terms of the same section
            # and year can coexist and membership is an exact match.
            terms = self.seen[section.course.code].setdefault(
                section.code, {}).setdefault(section.year, set())
            redefined = section.term in terms
            terms.add(section.term)

        for meeting in section.get('meetings', []):
            if ('course' in meeting
                    and meeting.course.code != section.course.code):
                raise JsonValidationError(
                    'course code {} in meeting doesnt match parent section '
                    'course code {}'.format(meeting.course.code,
                                            section.course.code), section)
            if 'section' in meeting and meeting.section.code != section.code:
                raise JsonValidationError(
                    'section code {} in nested meeting doesnt match parent '
                    'section code {}'.format(meeting.section.code,
                                             section.code), section)

            # NOTE: mutating obj
            meeting.course = section.course
            meeting.section = {'code': section.code}
            self.validate_meeting(meeting)

        if 'textbooks' in section:
            for textbook in section.textbooks:
                self.validate_textbook_link(textbook)

        if redefined:
            raise JsonDuplicationWarning(
                'multiple defs for {} {} - {} already defined'.format(
                    section.course.code, section.code, section.year),
                section)

    @staticmethod
    def _meeting_prefix(meeting):
        """Return the error-message prefix identifying `meeting`."""
        if 'course' in meeting and 'section' in meeting:
            return 'meeting for {} {}, '.format(meeting.course.code,
                                                meeting.section.code)
        # e.g. instructor office hours, which reference no course/section.
        return 'meeting, '

    def validate_meeting(self, meeting):
        """Validate meeting object.

        Meetings that reference no course or no section (e.g. instructor
        office hours) skip the relative-ordering checks.

        Args:
            meeting (DotDict): Meeting object to validate.

        Raises:
            JsonValidationError: Invalid meeting.
            JsonValidationWarning: Questionable time range or location.
        """
        if 'kind' in meeting and meeting.kind != 'meeting':
            raise JsonValidationError(
                'meeting object must be of kind meeting', meeting)

        if ('course' in meeting and
                self.course_code_regex.match(meeting.course.code) is None):
            raise JsonValidationError(
                'course code {} does not match regex \'{}\''.format(
                    meeting.course.code, self.config.course_code_regex),
                meeting)

        if 'time' in meeting:
            try:
                self.validate_time_range(meeting.time.start, meeting.time.end)
            except (JsonValidationError, JsonValidationWarning) as e:
                e.message = Validator._meeting_prefix(meeting) + e.message
                raise

        if 'location' in meeting:
            try:
                self.validate_location(meeting.location)
            except (JsonValidationError, JsonValidationWarning) as e:
                e.message = Validator._meeting_prefix(meeting) + e.message
                raise

        if not self.relative:
            return
        if 'course' not in meeting or 'section' not in meeting:
            return

        if meeting.course.code not in self.seen:
            raise JsonValidationError(
                'course code {} isnt defined'.format(meeting.course.code),
                meeting)
        if meeting.section.code not in self.seen[meeting.course.code]:
            raise JsonValidationError(
                'section {} isnt defined'.format(meeting.section.code),
                meeting)

    def validate_eval(self, course_eval):
        """Validate evaluation object.

        Args:
            course_eval (DotDict): Evaluation to validate.

        Raises:
            JsonValidationError: Invalid evaluation.
        """
        if not isinstance(course_eval, DotDict):
            course_eval = DotDict(course_eval)
        if self.course_code_regex.match(course_eval.course.code) is None:
            raise JsonValidationError(
                "course code {} does not match r'{}'".format(
                    course_eval.course.code, self.config.course_code_regex),
                course_eval)

    def validate_instructor(self, instructor):
        """Validate instructor object.

        Args:
            instructor (DotDict): Instructor object to validate.

        Raises:
            JsonValidationError: Invalid instructor.
        """
        if 'kind' in instructor and instructor.kind != 'instructor':
            raise JsonValidationError(
                'instructor object must be of kind instructor', instructor)

        for class_ in instructor.get('classes', []):
            if ('course' in class_ and
                    self.course_code_regex.match(class_.course.code) is None):
                raise JsonValidationError(
                    'course code {} does not match given regex {}'.format(
                        class_.course.code, self.config.course_code_regex),
                    instructor)

        if 'department' in instructor and 'departments' in self.config:
            dept_codes = {d.code for d in self.config.departments}
            if instructor.department not in dept_codes:
                raise JsonValidationError(
                    'department {} not listed in config.json'.format(
                        instructor.department), instructor)

        if 'homepage' in instructor:
            try:
                # Same URL check as used for course homepages.
                self.validate_website(instructor.homepage)
            except JsonValidationError as e:
                e.message = '@instructor {} homepage, {}'.format(
                    instructor.name, e.message)
                raise

        if 'office' in instructor:
            try:
                if 'location' in instructor.office:
                    self.validate_location(instructor.office.location)
                for office_hour in instructor.office.get('hours', []):
                    self.validate_meeting(office_hour)
            except JsonValidationError as e:
                e.message = '@instructor {} office, {}'.format(
                    instructor.name, e.message)
                raise

    def validate_final_exam(self, final_exam):
        """Validate final exam.

        Args:
            final_exam (DotDict): Final Exam object to validate.

        Raises:
            JsonValidationError: Invalid Final Exam.
        """
        if 'kind' in final_exam and final_exam.kind != 'final_exam':
            raise JsonValidationError(
                'final_exam object must be of kind "final_exam"', final_exam)
        try:
            self.validate_meeting(final_exam.meeting)
        except JsonValidationError as e:
            e.message = '@final_exam ' + e.message
            raise

    def validate_textbook_link(self, textbook_link):
        """Validate textbook link.

        A link that names no course is accepted as is.

        Args:
            textbook_link (DotDict): Textbook link object to validate.

        Raises:
            JsonValidationError: Invalid textbook link.
        """
        if 'course' not in textbook_link:
            return
        if self.course_code_regex.match(textbook_link.course.code) is not None:
            return
        raise JsonValidationError(
            'textbook_link course code doent match course code regex',
            textbook_link)

    def validate_location(self, location):
        """Validate location.

        Campus and building are only checked when the config lists them.

        Args:
            location (DotDict): Location object to validate.

        Raises:
            JsonValidationWarning: Invalid location.
        """
        if 'campus' in location and 'campuses' in self.config:
            if location.campus not in self.config.campuses:
                raise JsonValidationWarning(
                    'campus {} not in config'.format(location.campus),
                    location)

        if 'building' in location and 'buildings' in self.config:
            if location.building not in self.config.buildings:
                raise JsonValidationWarning(
                    'building {} not in config'.format(location.building),
                    location)

    @staticmethod
    def validate_website(url):
        """Validate url by sending HEAD request and analyzing response.

        Args:
            url (str): URL to validate.

        Raises:
            JsonValidationError: If URL is invalid.
        """
        connection = httplib.HTTPConnection(url)
        try:
            connection.request('HEAD', '')
            # The response of a request can only be fetched once.
            status = connection.getresponse().status
        finally:
            connection.close()

        # NOTE: 200 - good status
        #       301 - redirected
        if status in (200, 301):
            return

        raise JsonValidationError(
            'invalid website w/url "{}"'.format(url), {'url': url})

    def validate_time_range(self, start, end):
        """Validate start time is less than end time.

        There exists an unhandled case if the end time is midnight.

        Args:
            start (str): Start time.
            end (str): End time.

        Raises:
            JsonValidationError: If time range invalid.
        """
        try:
            start, end = map(dparser.parse, [start, end])
        except ValueError:
            # start/end are still the raw strings here.
            raise JsonValidationError(
                'invalid time range {}-{}'.format(start, end))

        if start >= end:
            raise JsonValidationError(
                'start {} >= end {}'.format(start, end))
        # NOTE: there exists an unhandled case if the end time is midnight.

    def validate_directory(self, directory):
        """Validate directory.

        Args:
            directory (str, dict): Directory to validate.
                May be either path or object.

        Raises:
            IOError: Directory path could not be read.
        """
        if isinstance(directory, str):
            try:
                name = directory
                directory = dir_to_dict(directory)
                directory['name'] = name
            except IOError as e:
                print('ERROR: invalid directory path\n' + str(e),
                      file=sys.stderr)
                raise
        Validator.schema_validate(directory, *Validator.SCHEMAS.directory)