Exemple #1
0
    def run(self, tracker, school, data_type, options):
        """Run the command: validate one school's data file, then digest it.

        Exceptions raised by the validation or digestion calls are logged
        with ``logging.exception`` instead of being propagated; a validation
        failure additionally skips digestion. Errors while opening or parsing
        the config/data files are not caught here.

        Args:
            tracker: Tracker object. Its ``school`` and ``mode`` attributes
                are set here, and progress-bar viewers are added/removed on
                it when ``options["display_progress_bar"]`` is truthy.
            school (str): School code, substituted for ``{school}`` in the
                path templates and used in the logger name.
            data_type (str): Substituted for ``{type}`` in the path
                templates.
            options (dict): Command options. Reads ``display_progress_bar``,
                ``config``, ``data``, ``diff``, ``load``, ``output_diff`` and
                the optional ``break_on_warning``. A string ``config`` is
                treated as a path template and is replaced in place by the
                parsed JSON content.
        """
        tracker.school = school
        tracker.mode = "validating"
        if options["display_progress_bar"]:
            tracker.add_viewer(
                StatProgressBar("{valid}/{total}", statistics=self.stat_view),
                name="progressbar",
            )
        logger = logging.getLogger("parsing.schools." + school)
        logger.debug("Digest command options:" + str(options))

        # Load config file to dictionary.
        if isinstance(options["config"], str):
            config_path = options["config"].format(school=school,
                                                   type=data_type)
            with open(config_path, "r") as config_file:
                options["config"] = json.load(config_file)

        try:
            Validator(options["config"],
                      tracker=tracker).validate_self_contained(
                          options["data"].format(school=school,
                                                 type=data_type),
                          break_on_error=True,
                          break_on_warning=options.get("break_on_warning"),
                          display_progress_bar=options["display_progress_bar"],
                      )
        except (ValidationError, ValidationWarning, Exception):
            # The trailing ``Exception`` makes this a catch-all: any failure
            # during validation means the data must not be digested.
            logging.exception("Failed validation before digestion")
            return  # Skip digestion for this school.

        if options["display_progress_bar"]:
            # Swap the validation progress bar for the digestion one.
            tracker.remove_viewer("progressbar")
            tracker.add_viewer(ETAProgressBar(), name="progressbar")
        tracker.mode = "digesting"

        with open(options["data"].format(school=school, type=data_type),
                  "r") as data_file:
            data = json.load(data_file)

        try:
            Digestor(school, meta=data["$meta"], tracker=tracker).digest(
                data["$data"],
                diff=options["diff"],
                load=options["load"],
                output=options["output_diff"].format(school=school,
                                                     type=data_type),
            )

        except DigestionError:
            logging.exception("Failed digestion")
        except PipelineException:
            # Was ``logging.expection`` (no such attribute), which raised
            # AttributeError from inside this handler instead of logging.
            logging.exception("Failed digestion w/in pipeline")
        except Exception:
            logging.exception("Failed digestion with uncaught exception")

        logging.info("Digestion overview for " + school + ": " +
                     str(self.stat_view.report()))
Exemple #2
0
    def __init__(self,
                 school,
                 config_path,
                 output_path,
                 output_error_path,
                 break_on_error=True,
                 break_on_warning=False,
                 display_progress_bar=True,
                 skip_duplicates=True,
                 validate=True,
                 tracker=None):
        """Construct ingestor object and resolve options.

        Args:
            school (string): The school code (e.g. jhu, gw, umich).
            config_path (str): Configuration file path.
            output_path (str): Output path.
            output_error_path (str): Error output path.
            break_on_error (bool, optional): Stop ingesting on error.
            break_on_warning (bool, optional): Stop ingesting on warning.
            display_progress_bar (bool, optional): display progress bar.
                Accepted for interface compatibility; not read by this
                constructor.
            skip_duplicates (bool, optional): Skip ingesting courses
                that have already been seen.
            validate (bool, optional): Perform validation.
            tracker (library.tracker, optional): tracker object. When
                omitted, a new ``NullTracker`` is created for this instance.
        """
        # A default of ``NullTracker()`` in the signature is evaluated once
        # and would be shared by every ingestor built without a tracker;
        # since the tracker is mutated below (viewer added, school set),
        # each instance gets its own.
        if tracker is None:
            tracker = NullTracker()

        self.school = school
        self.validate = validate
        self.break_on_error = break_on_error
        self.break_on_warning = break_on_warning
        self.skip_duplicates = skip_duplicates
        self.tracker = tracker
        self.hoarder = Hoarder()
        self.tracker.add_viewer(self.hoarder)
        self.tracker.school = school

        # Initialize loggers for json and errors.
        self.json = JSONStreamWriter(output_path, type_=dict).enter()
        self.data_list = self.json.write('$data', type_=list).enter()
        self.logger = Logger(errorfile=output_error_path)
        if self.validate:
            # The validator attribute only exists when validation is enabled.
            self.validator = Validator(config_path, tracker=self.tracker)

        # Inherit dictionary functionality.
        super(Ingestor, self).__init__()
Exemple #3
0
    def run(self, tracker, school, data_type, options):
        """Validate one school's data file and, if it passes, digest it.

        Args:
            tracker: Tracker object. Its ``school`` and ``mode`` attributes
                are set here and digestion failures are reported to it via
                ``see_error``.
            school (str): School code, substituted for ``{school}`` in the
                path templates.
            data_type (str): Substituted for ``{type}`` in the path
                templates.
            options (dict): Command options. Reads ``config_file``, ``data``,
                ``output_diff``, ``diff``, ``load``, ``display_progress_bar``
                and the optional ``break_on_warning`` / ``output_error``.
        """
        tracker.school = school

        tracker.mode = 'validating'

        # ``options.get`` yields None when no error-output template was
        # supplied; format it only when present rather than failing with
        # AttributeError on ``None.format``.
        # NOTE(review): assumes validate_self_contained accepts
        # output_error=None -- confirm against the Validator.
        output_error = options.get('output_error')
        if output_error is not None:
            output_error = output_error.format(school=school, type=data_type)

        try:
            Validator(options['config_file'].format(school=school,
                                                    type=data_type),
                      tracker=tracker).validate_self_contained(
                          options['data'].format(school=school,
                                                 type=data_type),
                          break_on_error=True,
                          break_on_warning=options.get('break_on_warning'),
                          output_error=output_error,
                          display_progress_bar=options['display_progress_bar'])
        except JsonException:
            print('FAILED VALIDATION', file=sys.stderr)
            return  # Skip digestion for this school.

        tracker.mode = 'digesting'

        try:
            Digestor(school,
                     data=options['data'].format(school=school,
                                                 type=data_type),
                     output=options['output_diff'].format(school=school,
                                                          type=data_type),
                     diff=options['diff'],
                     load=options['load'],
                     display_progress_bar=options['display_progress_bar'],
                     tracker=tracker).digest()

        except DigestionError as e:
            # Known digestion failure: report the message plus the traceback.
            self.stderr.write(self.style.ERROR('FAILED: digestion'))
            self.stderr.write(str(e))
            tracker.see_error(str(e) + '\n' + traceback.format_exc())
        except Exception:
            # Anything else: report the full traceback in both places.
            self.stderr.write(self.style.ERROR('FAILED: digestion'))
            self.stderr.write(traceback.format_exc())
            tracker.see_error(traceback.format_exc())
Exemple #4
0
    def __init__(self, config, output,
                 break_on_error=True,
                 break_on_warning=False,
                 display_progress_bar=True,
                 skip_duplicates=True,
                 validate=True,
                 tracker=None):
        """Construct ingestor object and resolve options.

        The school code is not passed separately; it is read from
        ``config['school']['code']``.

        Args:
            config (dict): Configuration dictionary.
            output (str, file): Output path or file object.
            break_on_error (bool, optional): Stop ingesting on error.
            break_on_warning (bool, optional): Stop ingesting on warning.
            display_progress_bar (bool, optional): display progress bar.
                Accepted for interface compatibility; not read by this
                constructor.
            skip_duplicates (bool, optional): Skip ingesting courses
                that have already been seen.
            validate (bool, optional): Perform validation.
            tracker (library.tracker, optional): tracker object. When
                omitted, a new ``NullTracker`` is created for this instance.
        """
        # A default of ``NullTracker()`` in the signature is evaluated once
        # and would be shared by every ingestor built without a tracker;
        # since the tracker is mutated below (viewer added, school set),
        # each instance gets its own.
        if tracker is None:
            tracker = NullTracker()

        self.school = config['school']['code']
        self.validate = validate
        self.break_on_error = break_on_error
        self.break_on_warning = break_on_warning
        self.skip_duplicates = skip_duplicates
        self.tracker = tracker
        self.hoarder = Hoarder()
        self.tracker.add_viewer(self.hoarder)
        self.tracker.school = self.school

        # Initialize loggers for json and errors.
        self.json = JSONStreamWriter(output, type_=dict).enter()
        self.data_list = self.json.write('$data', type_=list).enter()
        if self.validate:
            # The validator attribute only exists when validation is enabled.
            self.validator = Validator(config, tracker=self.tracker)

        # Inherit dictionary functionality.
        super(Ingestor, self).__init__()
Exemple #5
0
    def run(self, options, school, data_type):
        """Run the validator.

        Failures are reported, not raised: a header line goes to
        ``self.stdout`` and the details (message or traceback) to
        ``self.stderr``.

        Args:
            options (dict): Command line options for arg parser. Reads
                ``config_file``, ``data``, ``display_progress_bar`` and the
                optional ``break_on_error``, ``break_on_warning`` and
                ``output_error``.
            school (str): School to parse.
            data_type (str): {'courses', 'evals', 'textbooks'}
        """
        try:
            # ``options.get`` yields None when no error-output template was
            # supplied; format it only when present rather than failing with
            # AttributeError on ``None.format``.
            # NOTE(review): assumes validate_self_contained accepts
            # output_error=None -- confirm against the Validator.
            output_error = options.get('output_error')
            if output_error is not None:
                output_error = output_error.format(school=school,
                                                   type=data_type)

            Validator(
                options['config_file'].format(school=school, type=data_type)
            ).validate_self_contained(
                options['data'].format(school=school, type=data_type),
                break_on_error=options.get('break_on_error'),
                break_on_warning=options.get('break_on_warning'),
                output_error=output_error,
                display_progress_bar=options['display_progress_bar'])
        except JsonException as e:
            self.stdout.write(self.style.ERROR('FAILED VALIDATION ' + school))
            self.stderr.write(str(e))
        except Exception:
            self.stdout.write(self.style.ERROR('FAILED VALIDATION ' + school))
            self.stderr.write(traceback.format_exc())
Exemple #6
0
class Ingestor(dict):
    """Ingest parsing data into formatted json.

    Mimics functionality of dict: callers store raw fields under any of the
    aliases listed in ``ALL_KEYS`` and then call one of the ``ingest_*``
    methods, which assembles an object from those fields, validates it (when
    enabled) and writes it to the output stream.

    Attributes:
        ALL_KEYS (set): Set of keys supported by Ingestor.
        break_on_error (bool): Break/cont on errors.
        break_on_warning (bool): Break/cont on warnings.
        data_list: Stream writer for the ``$data`` list of the output.
        hoarder (Hoarder): Viewer registered on the tracker; its ``schools``
            attribute is written to the output's ``$meta`` in ``end``.
        json (JSONStreamWriter): Top-level output writer.
        school (str): School code (e.g. jhu, gw, umich).
        skip_duplicates (bool): Skip ingestion for repeated definitions.
        tracker (library.tracker): Tracker object.
        validate (bool): Enable/disable validation.
        validator (library.validator): Validator instance (only set when
            ``validate`` is truthy).
    """

    ALL_KEYS = {
        # 'school_code' is queried by ingest_textbook_link and therefore has
        # to be a supported key, otherwise _get rejects it.
        'school', 'school_code',
        'school_subdivision_code', 'school_subdivision_name',
        'kind',
        'department',
        'dept',
        'department_name',
        'department_code',
        'dept_name',
        'dept_code',
        'code', 'course_code', 'course',
        'name',
        'course_name',
        'prerequisites',
        'prereqs',
        'corequisites',
        'coreqs',
        'exclusions',
        'description',
        'descr',
        'areas',
        'level',
        'cores',
        'geneds',
        'homepage',
        'website',
        'instructors', 'instructor', 'instr', 'instrs', 'instr_name',
        'instr_names', 'instructor_name',
        'section', 'sections', 'section_code', 'section_name',
        'meeting_section',
        'section_type', 'type',
        'term',
        'semester',
        'year',
        'capacity', 'size',
        'enrollment', 'enrolment',
        'waitlist', 'waitlist_size',
        'remaining_seats',
        'fees', 'fee', 'cost',
        'final_exam',
        'offerings', 'meetings',
        'time_start', 'start_time',
        'time_end', 'end_time',
        'location',
        'loc', 'where',
        'days', 'day', 'dates', 'date',
        'time',
        'credits', 'num_credits',
        'campus',  # TODO - not really
        'textbooks', 'isbn', 'required',
        'detail_url', 'image_url', 'author', 'title',
        'score',
        'summary',
        'same_as',
    }

    def __init__(self, config, output,
                 break_on_error=True,
                 break_on_warning=False,
                 display_progress_bar=True,
                 skip_duplicates=True,
                 validate=True,
                 tracker=None):
        """Construct ingestor object and resolve options.

        The school code is not passed separately; it is read from
        ``config['school']['code']``.

        Args:
            config (dict): Configuration dictionary.
            output (str, file): Output path or file object.
            break_on_error (bool, optional): Stop ingesting on error.
            break_on_warning (bool, optional): Stop ingesting on warning.
            display_progress_bar (bool, optional): display progress bar.
                Accepted for interface compatibility; not read by this
                constructor.
            skip_duplicates (bool, optional): Skip ingesting courses
                that have already been seen.
            validate (bool, optional): Perform validation.
            tracker (library.tracker, optional): tracker object. When
                omitted, a new ``NullTracker`` is created for this instance.
        """
        # A default of ``NullTracker()`` in the signature is evaluated once
        # and would be shared by every ingestor built without a tracker;
        # since the tracker is mutated below (viewer added, school set),
        # each instance gets its own.
        if tracker is None:
            tracker = NullTracker()

        self.school = config['school']['code']
        self.validate = validate
        self.break_on_error = break_on_error
        self.break_on_warning = break_on_warning
        self.skip_duplicates = skip_duplicates
        self.tracker = tracker
        self.hoarder = Hoarder()
        self.tracker.add_viewer(self.hoarder)
        self.tracker.school = self.school

        # Initialize loggers for json and errors.
        self.json = JSONStreamWriter(output, type_=dict).enter()
        self.data_list = self.json.write('$data', type_=list).enter()
        if self.validate:
            self.validator = Validator(config, tracker=self.tracker)

        # Inherit dictionary functionality.
        super(Ingestor, self).__init__()

    def _get(self, *keys, **kwargs):
        """Match the first key found in self dictionary.

        Note that this is purposefully not an override of the regular dict
        lookup. This allows the Ingestor to maintain dictionary-like
        functionality for the API user while internally checking itself.

        Args:
            *keys: The list of keys, tried in order.
            **kwargs: ``default`` - value returned when none of the keys is
                present (None if not given).
                TODO - Change if update to Python3

        Returns:
            The value of the first key present in the Ingestor instance,
            or the default.

        Raises:
            IngestionWarning: Enforce Ingestor.ALL_KEYS; raised as soon as a
                key outside ALL_KEYS is reached during the scan.
        """
        default = kwargs.get('default')
        for key in keys:
            if key not in Ingestor.ALL_KEYS:
                raise IngestionWarning(key + ' not in Ingestor.ALL_KEYS')
            if key not in self:
                continue
            return self[key]
        return default

    def _resolve_department(self):
        """Return the department, built from name/code fields if needed.

        The stored 'department' value is used only when it is present and
        none of the separate name/code keys are; otherwise a
        ``{'name', 'code'}`` dict is assembled from those keys.
        """
        department = self._get('department')
        if ('department' not in self or
                ('department_name' in self or
                    'department_code' in self or
                    'dept_name' in self or
                    'dept_code' in self)):
            # if not isinstance(self._get('department', 'dept'), dict):
            department = {
                'name': titlize(self._get('department_name', 'dept_name')),
                'code': self._get('department_code', 'dept_code')
            }
        return department

    def _resolve_instructors(self):
        """Return the cleaned instructor list, or None if none were given.

        Exactly one of the instructor alias keys may be set. Plain string
        entries are wrapped as ``{'name': <string>}``.

        Raises:
            IngestionWarning: more than one instructor alias key is set.
        """
        instr_keys = {
            'instructors',
            'instructor',
            'instr',
            'instrs',
            'instr_name',
            'instr_names',
            'instructor_name',
        } & set(self)

        if len(instr_keys) > 1:
            raise IngestionWarning(
                'cannot resolve instructors from keys: {}'.format(
                    # Sorted so the message does not depend on set ordering.
                    ','.join(sorted(instr_keys))
                ),
                self
            )
        if not instr_keys:
            return None

        instructors = clean(make_list(self[next(iter(instr_keys))]))
        if instructors is not None:
            for i, instructor in enumerate(instructors):
                # basestring: Python 2 builtin; presumably supplied by a
                # compat import if this file runs on Python 3 -- verify.
                if isinstance(instructor, basestring):
                    instructors[i] = {'name': instructor}
        return instructors

    def _resolve_time(self):
        """Return the 'time' value, or build it from start/end time keys."""
        if 'time' in self:
            return self._get('time')
        return {
            'start': time24(self._get('time_start', 'start_time')),
            'end': time24(self._get('time_end', 'end_time'))
        }

    def _resolve_location(self):
        """Return the location, wrapping a plain string as ``{'where': ...}``.

        NOTE(review): a non-string value stored under 'loc' or 'where'
        (without 'location') is not returned here; only 'location' is read
        in that case -- confirm this is intended.
        """
        candidate = self._get('location', 'loc', 'where')
        if isinstance(candidate, basestring):
            return {'where': candidate}
        return self._get('location')

    def ingest_course(self):
        """Create course json from info in model map.

        Returns:
            dict: course
        """
        course = {
            'kind': 'course',
            'school': {
                'code': self.school,
                'subdivisions': [
                    {
                        'code': self._get('school_subdivision_code'),
                        'name': self._get('school_subdivision_name')
                    }
                ]
            },
            'code': self._get('course_code', 'code', 'course'),
            'name': titlize(self._get('name', 'course_name')),
            'department': self._resolve_department(),
            'credits': safe_cast(self._get('credits', 'num_credits'), float, default=0.),
            'prerequisites': make_list(self._get('prerequisites', 'prereqs')),
            'corequisites': make_list(self._get('corequisites', 'coreqs')),
            'exclusions': make_list(self._get('exclusions')),
            'areas': make_list(self._get('areas')),
            'level': self._get('level'),
            'cores': make_list(self._get('cores')),
            'geneds': make_list(self._get('geneds')),
            'sections': self._get('sections'),
            'homepage': self._get('homepage', 'website'),
            'same_as': make_list(self._get('same_as')),
            'description': self._get('description', 'descr'),
            # 'description': extract_info_from_text(
            #     self._get('description', 'descr'),
            #     inject=self
            # ),
        }

        course = clean(course)
        self._validate_and_log(course)
        if 'department' in course:
            self.tracker.department = course['department']
        return course

    def ingest_section(self, course):
        """Create section json object from info in model map.

        Args:
            course (dict): validated course object

        Returns:
            dict: section
        """
        section = {
            'kind': 'section',
            'course': {
                'code': course.get('code')
            },
            'code': self._get('section_code', 'section',
                              'meeting_section'),
            'name': titlize(self._get('section_name')),
            'term': self._get('term', 'semester'),
            'year': str(self._get('year')),
            'instructors': self._resolve_instructors(),
            'capacity': safe_cast(self._get('capacity', 'size'), int),
            'enrollment': safe_cast(self._get('enrollment', 'enrolment'), int),
            'waitlist': safe_cast(self._get('waitlist'), int),
            'waitlist_size': safe_cast(self._get('waitlist_size'), int),
            'remaining_seats': safe_cast(self._get('remaining_seats'), int),
            'type': self._get('type', 'section_type'),
            'fees': safe_cast(self._get('fees', 'fee', 'cost'), float),
            'final_exam': self._get('final_exam'),
            'textbooks': self._get('textbooks'),
            'meetings': self._get('offerings', 'meetings')
        }

        section = clean(section)
        self._validate_and_log(section)
        self.tracker.year = section['year']
        self.tracker.term = section['term']
        return section

    def ingest_meeting(self, section, clean_only=False):
        """Create meeting ingested json map.

        Args:
            section (dict): validated section object
            clean_only (bool, optional): Return the cleaned meeting without
                validating, writing or updating the tracker.

        Returns:
            dict: meeting
        """
        year = str(self._get('year'))
        term = self._get('term', 'semester')
        if section.get('code') is None:
            # Without a section code the year/term reference is dropped too.
            year = None
            term = None

        meeting = {
            'kind': 'meeting',
            'course': section.get('course'),
            'section': {
                'code': section.get('code'),
                'year': year,
                'term': term,
            },
            'days': make_list(self._get('days', 'day')),
            'dates': make_list(self._get('dates', 'date')),
            'time': self._resolve_time(),
            'location': self._resolve_location()
        }

        meeting = clean(meeting)

        if clean_only:
            return meeting

        self._validate_and_log(meeting)
        if 'time' in meeting:
            # Two consecutive assignments: start first, then end.
            self.tracker.time = meeting['time']['start']
            self.tracker.time = meeting['time']['end']
        return meeting

    def ingest_textbook_link(self, section=None):
        """Create textbook link json object.

        Args:
            section (None, :obj:`dict`, optional): Not read by this method.

        Returns:
            dict: textbook link.
        """
        textbook_link = {
            'kind': 'textbook_link',
            'school': {
                'code': self._get('school', 'school_code')
            },
            'course': {
                'code': self._get('course_code')
            },
            'section': {
                'code': self._get('section_code'),
                'year': str(self._get('year')),
                'term': self._get('term', 'semester')
            },
            'isbn': self._get('isbn'),
            'required': self._get('required')
        }

        textbook_link = clean(textbook_link)
        self._validate_and_log(textbook_link)
        self.tracker.year = textbook_link['section']['year']
        self.tracker.term = textbook_link['section']['term']
        if 'department' in self:
            self.tracker.department = self['department']
        return textbook_link

    def ingest_textbook(self):
        """Create textbook json object.

        Returns:
            dict: textbook
        """
        textbook = {
            'kind': 'textbook',
            'isbn': self._get('isbn'),
            'detail_url': self._get('detail_url'),
            'image_url': self._get('image_url'),
            'author': self._get('author'),
            'title': self._get('title')
        }

        textbook = clean(textbook)
        self._validate_and_log(textbook)
        if 'department' in self:
            self.tracker.department = self['department']
        return textbook

    def ingest_eval(self):
        """Create evaluation json object.

        Returns:
            dict: eval
        """
        evaluation = {
            'kind': 'eval',
            'year': str(self._get('year')),
            'term': self._get('term'),
            'score': float(self._get('score')),
            'instructors': self._resolve_instructors(),
            'course': {
                'code': self._get('course_code')
            }
        }

        evaluation = clean(evaluation)
        self._validate_and_log(evaluation)
        self.tracker.year = evaluation['year']
        self.tracker.term = evaluation['term']
        return evaluation

    def end(self):
        """Finish ingesting.

        Close i/o, clear internal state, write meta info
        """
        self.data_list.exit()
        self.json.write('$meta', {
            '$schools': self.hoarder.schools,
            '$timestamp': self.tracker.start_time
        })
        self.json.exit()
        self.clear()

    def _validate_and_log(self, obj):
        """Validate ``obj`` (if enabled), write it out and update stats.

        With validation disabled the object is written unconditionally.
        Otherwise it is written only when the validator accepts it, and
        skipped entirely (no write, no stats) when the validator flags it
        as a duplicate to skip. Afterwards every key currently stored in the
        ingestor is checked against ALL_KEYS.

        Raises:
            IngestionWarning: an unsupported key is stored and
                ``break_on_warning`` is set.
        """
        if self.validate is False:
            self.data_list.write(obj)
            self.tracker.stats = dict(kind=obj['kind'], status='total')
            return

        is_valid, skip = self._run_validator(obj)
        if skip:
            return
        if is_valid:
            self.data_list.write(obj)
        try:
            for key in self:
                if key in Ingestor.ALL_KEYS:
                    continue
                # NOTE(review): argument order (self, message) differs from
                # the (message, self) order used in _resolve_instructors --
                # confirm which one IngestionWarning expects.
                raise IngestionWarning(
                    self,
                    'ingestor does not support key {}: {}'.format(key,
                                                                  self[key])
                )
        except IngestionWarning:
            # Raised and caught locally so the warning is logged with a
            # traceback even when it is not propagated.
            logging.exception('Ingestor warning')
            if self.break_on_warning:
                raise
        self.tracker.stats = dict(kind=obj['kind'], status='total')

    def _run_validator(self, data):
        """Run the validator on ``data``.

        Returns:
            tuple: ``(is_valid, full_skip)``. ``is_valid`` is True when
            validation passed or only produced a (non-skipped) warning;
            ``full_skip`` is True for a duplicate definition while
            ``skip_duplicates`` is set.

        Raises:
            ValidationError: validation failed and ``break_on_error`` is set.
            ValidationWarning: a non-skipped warning occurred and
                ``break_on_warning`` is set.
        """
        is_valid = False
        full_skip = False

        logger = logging.getLogger('parsing.schools.' + self.school)

        try:
            self.validator.validate(data)
            self.tracker.stats = dict(kind=data['kind'], status='valid')
            is_valid = True
        except ValidationError as e:
            if self.break_on_error:
                raise ValidationError(*e.args)
            else:
                logger.warning('Ingestion failed', exc_info=True)
                # A placeholder is required for the dump to be rendered;
                # without one the argument never appears in the message.
                logger.debug('Ingestor dump: %s', dict(self))
        except ValidationWarning as e:
            if (isinstance(e, MultipleDefinitionsWarning) and
                    self.skip_duplicates):
                full_skip = True
            else:
                is_valid = True
                if self.break_on_warning:
                    raise ValidationWarning(*e.args)
                else:
                    logger.warning('Validation warning', exc_info=True)
                    logger.debug('Ingestor dump: %s', dict(self))

        return is_valid, full_skip
Exemple #7
0
    def test_validator_nested(self):
        """Validate a course that carries nested sections and meetings.

        Expects ValidationError for nested objects that carry their own
        'course' reference or a ``None`` 'days' value, and
        MultipleDefinitionsWarning for a repeated section code or for
        validating the same course twice.
        """
        validator = Validator(ValidationTest.config)

        # Section without meetings.
        first_section = {
            'code': '001',
            'term': 'Bar',
            'year': '2017',
            'instructors': [
                {'name': {'first': 'Sem', 'last': 'Ly'}},
                {'name': 'Semesterly'},
            ],
            'capacity': 42,
            'enrollment': 41,
            'waitlist': 0,
            'waitlist_size': 100,
            'type': 'Lecture',
            'fees': 50.,
        }

        # Section with two nested meetings.
        second_section = {
            'code': '002',
            'term': 'Bar',
            'year': '2017',
            'instructors': [
                {'name': 'Semesterly'},
            ],
            'capacity': 40,
            'enrollment': 36,
            'waitlist': 0,
            'waitlist_size': 100,
            'type': 'Lecture',
            'fees': 50.,
            'meetings': [
                {
                    'days': ['M', 'F'],
                    'time': {'start': '14:00', 'end': '14:50'},
                    'location': {
                        'campus': 'Homewood',
                        'building': 'Malone',
                        'room': 'Ugrad',
                    },
                },
                {
                    'days': ['W'],
                    'time': {'start': '10:00', 'end': '12:15'},
                },
            ],
        }

        nested_course = {
            'kind': 'course',
            'school': {'code': 'test'},
            'code': 'ABC',
            'name': 'Alphabet',
            'department': {'code': 'GHI', 'name': 'English'},
            'credits': 3.,
            'prerequisites': ['ABC', 'DEF'],
            'corequisites': ['A', 'AB', 'BC', 'B', 'C'],
            'homepage': 'www.google.com',
            'same_as': ['ABD'],
            'description': 'Um, hi hello',
            'sections': [first_section, second_section],
        }

        # Nested section that names a course itself.
        section_with_course = deepcopy(nested_course)
        section_with_course['sections'][0]['course'] = {'code': 'ABD'}
        with self.assertRaises(ValidationError):
            validator.validate(section_with_course)

        # Nested meeting that names a course itself.
        meeting_with_course = deepcopy(nested_course)
        meeting_with_course['sections'][1]['meetings'][1]['course'] = {
            'code': 'ABD'
        }
        with self.assertRaises(ValidationError):
            validator.validate(meeting_with_course)

        # Both sections share the same code.
        repeated_section = deepcopy(nested_course)
        repeated_section['sections'][1]['code'] = '001'
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(repeated_section)

        # Meeting whose 'days' is None.
        meeting_without_days = deepcopy(nested_course)
        meeting_without_days['sections'][1]['meetings'][1]['days'] = None
        with self.assertRaises(ValidationError):
            validator.validate(meeting_without_days)

        # The untouched course validates once; a second pass is a duplicate.
        validator.validate(nested_course)
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(nested_course)
Exemple #8
0
    def test_validator_flat(self):
        config_required = {
            'school',
            'course_code_regex',
            'terms',
            'single_access',
            'granularity',
            'full_academic_year_registration',
            'active_semesters',
            'ampm'
        }

        for req in config_required:
            invalid_config = {
                k: v for k, v in ValidationTest.config.items() if k != req
            }
            with self.assertRaises(ValidationError):
                Validator(invalid_config)
        validator = Validator(ValidationTest.config)
        course = {
            'kind': 'course',
            'school': {
                'code': 'test'
            },
            'code': 'ABC',
            'name': 'Alphabet',
            'department': {
                'code': 'GHI',
                'name': 'English'
            },
            'credits': 3.,
            'prerequisites': ['ABC', 'DEF'],
            'corequisites': ['A', 'AB', 'BC', 'B', 'C'],
            'homepage': 'www.google.com',
            'same_as': ['ABD'],
            'description': 'Um, hi hello',
        }
        with self.assertRaises(ValidationError):
            invalid = deepcopy(course)
            invalid['school']['code'] = 'nottest'
            validator.validate(invalid)
        # with self.assertRaises(ValidationError):
        #     invalid = deepcopy(course)
        #     invalid['same_as'].append('abc')
        #     validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(course)
            invalid['code'] = 'abc'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(course)
            invalid['code'] = 'abc'
            validator.validate(invalid)
        validator.validate(course)
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(course)

        section = {
            'kind': 'section',
            'course': {
                'code': 'ABC',
            },
            'code': '001',
            'term': 'Bar',
            'year': '2017',
            'instructors': [
                {
                    'name': {
                        'first': 'Sem',
                        'last': 'Ly'
                    }
                },
                {
                    'name': 'Semesterly'
                }
            ],
            'capacity': 42,
            'enrollment': 41,
            'waitlist': 0,
            'waitlist_size': 100,
            'type': 'Lecture',
            'fees': 50.,
        }

        with self.assertRaises(ValidationError):
            invalid = deepcopy(section)
            invalid['course']['code'] = 'ABD'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(section)
            invalid['term'] = 'NotInConfig'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(section)
            invalid['capacity'] = -1
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(section)
            invalid['enrollment'] = -1
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(section)
            invalid['fees'] = 'NotAFloat'
            validator.validate(invalid)
        validator.validate(section)
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(section)

        meeting = {
            'kind': 'meeting',
            'course': {
                'code': 'ABC'
            },
            'section': {
                'code': '001',
                'year': '2017',
                'term': 'Bar'
            },
            'days': ['M', 'W', 'F'],
            'time': {
                'start': '14:00',
                'end': '14:50'
            },
            'location': {
                'campus': 'Homewood',
                'building': 'Malone',
                'room': 'Ugrad'
            }
        }

        with self.assertRaises(ValidationError):
            invalid = deepcopy(meeting)
            invalid['course']['code'] = 'ABD'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(meeting)
            invalid['section']['code'] = '002'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(meeting)
            invalid['section']['term'] = 'InvalidTerm'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(meeting)
            invalid['section']['year'] = '2018'
            validator.validate(invalid)
        with self.assertRaises(ValidationError):
            invalid = deepcopy(meeting)
            invalid['time']['start'] = '15:00'
            validator.validate(invalid)
        # with self.assertRaises(ValidationWarning):
        #     invalid = deepcopy(meeting)
        #     invalid['time']['start'] = '14:50'
        #     validator.validate(invalid)
        with self.assertRaises(ValidationWarning):
            invalid = deepcopy(meeting)
            invalid['location']['campus'] = 'NotInConfigList'
            validator.validate(invalid)
        validator.validate(meeting)

        textbook_link = {
            'kind': 'textbook_link',
            'school': {
                'code': 'test'
            },
            'course': {
                'code': 'ABC'
            },
            'section': {
                'code': '001',
                'year': '2017',
                'term': 'Bar'
            },
            'isbn': '9780262033848',
            'required': True
        }

        with self.assertRaises(ValidationError):
            invalid = deepcopy(textbook_link)
            invalid['course']['code'] = 'abc'
            validator.validate(invalid)
        validator.validate(textbook_link)
Exemple #9
0
    def test_validator_nested(self):
        """Validate a course that carries its sections and meetings inline.

        Corrupted copies of the course are validated first and must raise
        the listed exception.  The pristine course is then validated once
        (must pass) and a second time, which must be reported as a multiple
        definition.  The validator instance is shared, so the order of the
        calls below is kept fixed.
        """
        validator = Validator(ValidationTest.config)

        first_section = {
            "code": "001",
            "term": "Bar",
            "year": "2017",
            "instructors": [
                {"name": {"first": "Sem", "last": "Ly"}},
                {"name": "Semesterly"},
            ],
            "capacity": 42,
            "enrollment": 41,
            "waitlist": 0,
            "waitlist_size": 100,
            "type": "Lecture",
            "fees": 50.0,
        }
        second_section = {
            "code": "002",
            "term": "Bar",
            "year": "2017",
            "instructors": [{"name": "Semesterly"}],
            "capacity": 40,
            "enrollment": 36,
            "waitlist": 0,
            "waitlist_size": 100,
            "type": "Lecture",
            "fees": 50.0,
            "meetings": [
                {
                    "days": ["M", "F"],
                    "dates": {"start": "08-29-2017", "end": "12-10-2017"},
                    "time": {"start": "14:00", "end": "14:50"},
                    "location": {
                        "campus": "Homewood",
                        "building": "Malone",
                        "room": "Ugrad",
                    },
                    "is_short_course": False,
                },
                {
                    "days": ["W"],
                    "dates": {"start": "08-29-2017", "end": "12-10-2017"},
                    "time": {"start": "10:00", "end": "12:15"},
                    "is_short_course": False,
                },
            ],
        }
        nested_course = {
            "kind": "course",
            "school": {"code": "test"},
            "code": "ABC",
            "name": "Alphabet",
            "department": {"code": "GHI", "name": "English"},
            "credits": 3.0,
            "prerequisites": ["ABC", "DEF"],
            "corequisites": ["A", "AB", "BC", "B", "C"],
            "homepage": "www.google.com",
            "same_as": ["ABD"],
            "description": "Um, hi hello",
            "sections": [first_section, second_section],
        }

        def section_points_at_other_course(course):
            course["sections"][0]["course"] = {"code": "ABD"}

        def meeting_points_at_other_course(course):
            course["sections"][1]["meetings"][1]["course"] = {"code": "ABD"}

        def sections_share_a_code(course):
            course["sections"][1]["code"] = "001"

        def meeting_days_missing(course):
            course["sections"][1]["meetings"][1]["days"] = None

        corruptions = (
            (ValidationError, section_points_at_other_course),
            (ValidationError, meeting_points_at_other_course),
            (MultipleDefinitionsWarning, sections_share_a_code),
            (ValidationError, meeting_days_missing),
        )
        for expected, corrupt in corruptions:
            with self.assertRaises(expected):
                broken = deepcopy(nested_course)
                corrupt(broken)
                validator.validate(broken)

        # The untouched course is accepted once ...
        validator.validate(nested_course)
        # ... and flagged when the very same definition is seen again.
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(nested_course)
Exemple #10
0
    def test_validator_flat(self):
        """Validate standalone course, section, meeting and textbook objects.

        First checks that a config missing any one required key is rejected
        by the ``Validator`` constructor.  Then, for each kind of object,
        corrupted copies are validated (each must raise), followed by the
        pristine object.  One validator instance is shared by all checks, so
        the order of the calls below is kept fixed.
        """
        required_keys = {
            "school",
            "course_code_regex",
            "terms",
            "single_access",
            "granularity",
            "full_academic_year_registration",
            "active_semesters",
            "ampm",
        }
        for missing in required_keys:
            incomplete_config = {
                key: value
                for key, value in ValidationTest.config.items()
                if key != missing
            }
            with self.assertRaises(ValidationError):
                Validator(incomplete_config)

        validator = Validator(ValidationTest.config)

        def assert_rejected(expected, template, path, value):
            # Copy `template`, overwrite the entry reached by following
            # `path`, and require `expected` to be raised by the validator.
            with self.assertRaises(expected):
                invalid = deepcopy(template)
                holder = invalid
                for key in path[:-1]:
                    holder = holder[key]
                holder[path[-1]] = value
                validator.validate(invalid)

        # ---------------------------------------------------------- course
        course = {
            "kind": "course",
            "school": {"code": "test"},
            "code": "ABC",
            "name": "Alphabet",
            "department": {"code": "GHI", "name": "English"},
            "credits": 3.0,
            "prerequisites": ["ABC", "DEF"],
            "corequisites": ["A", "AB", "BC", "B", "C"],
            "homepage": "www.google.com",
            "same_as": ["ABD"],
            "description": "Um, hi hello",
        }
        assert_rejected(ValidationError, course, ("school", "code"),
                        "nottest")
        # Disabled check: appending 'abc' to same_as was expected to raise
        # ValidationError.
        # The lowercase-code check is intentionally issued twice, exactly as
        # in the original sequence of validator calls.
        assert_rejected(ValidationError, course, ("code",), "abc")
        assert_rejected(ValidationError, course, ("code",), "abc")
        validator.validate(course)
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(course)

        # --------------------------------------------------------- section
        section = {
            "kind": "section",
            "course": {"code": "ABC"},
            "code": "001",
            "term": "Bar",
            "year": "2017",
            "instructors": [
                {"name": {"first": "Sem", "last": "Ly"}},
                {"name": "Semesterly"},
            ],
            "capacity": 42,
            "enrollment": 41,
            "waitlist": 0,
            "waitlist_size": 100,
            "type": "Lecture",
            "fees": 50.0,
        }
        section_corruptions = (
            (("course", "code"), "ABD"),
            (("term",), "NotInConfig"),
            (("capacity",), -1),
            (("enrollment",), -1),
            (("fees",), "NotAFloat"),
        )
        for path, value in section_corruptions:
            assert_rejected(ValidationError, section, path, value)
        validator.validate(section)
        with self.assertRaises(MultipleDefinitionsWarning):
            validator.validate(section)

        # --------------------------------------------------------- meeting
        meeting = {
            "kind": "meeting",
            "course": {"code": "ABC"},
            "section": {"code": "001", "year": "2017", "term": "Bar"},
            "days": ["M", "W", "F"],
            "dates": {"start": "08-29-2017", "end": "12-10-2017"},
            "time": {"start": "14:00", "end": "14:50"},
            "location": {
                "campus": "Homewood",
                "building": "Malone",
                "room": "Ugrad",
            },
            "is_short_course": False,
        }
        meeting_corruptions = (
            (("course", "code"), "ABD"),
            (("section", "code"), "002"),
            (("section", "term"), "InvalidTerm"),
            (("section", "year"), "2018"),
            (("time", "start"), "15:00"),
        )
        for path, value in meeting_corruptions:
            assert_rejected(ValidationError, meeting, path, value)
        # Disabled check: a start time of '14:50' (equal to the end time) was
        # expected to raise ValidationWarning.
        assert_rejected(ValidationWarning, meeting, ("location", "campus"),
                        "NotInConfigList")
        validator.validate(meeting)

        # --------------------------------------------------- textbook link
        textbook_link = {
            "kind": "textbook_link",
            "school": {"code": "test"},
            "course": {"code": "ABC"},
            "section": {"code": "001", "year": "2017", "term": "Bar"},
            "isbn": "9780262033848",
            "required": True,
        }
        assert_rejected(ValidationError, textbook_link, ("course", "code"),
                        "abc")
        validator.validate(textbook_link)
Exemple #11
0
class Ingestor(dict):
    """Ingest parsing data into formatted json.

    Mimics functionality of dict.  Values stored in the mapping under any of
    the keys in ``ALL_KEYS`` are read by the ``ingest_*`` methods, each of
    which assembles one object, validates it (unless validation is disabled)
    and appends it to the ``$data`` list of the output.

    Attributes:
        ALL_KEYS (set): Set of keys supported by Ingestor.
        break_on_error (bool): Break/cont on errors.
        break_on_warning (bool): Break/cont on warnings.
        data_list: Writer for the ``$data`` list of the output.
        hoarder (Hoarder): Viewer registered on the tracker; its ``schools``
            attribute is written to ``$meta`` by :meth:`end`.
        json (JSONStreamWriter): Writer for the top-level output object.
        school (str): School code (e.g. jhu, gw, umich).
        skip_duplicates (bool): Skip ingestion for repeated definitions.
        tracker (library.tracker): Tracker object.
        validate (bool): Enable/disable validation.
        validator (library.validator): Validator instance; only set when
            ``validate`` is true.
    """

    ALL_KEYS = {
        # school
        "school",
        "school_code",  # alias looked up by ingest_textbook_link
        "school_subdivision_code",
        "school_subdivision_name",
        "kind",
        # department
        "department",
        "dept",
        "department_name",
        "department_code",
        "dept_name",
        "dept_code",
        # course
        "code",
        "course_code",
        "course",
        "name",
        "course_name",
        "prerequisites",
        "prereqs",
        "corequisites",
        "coreqs",
        "exclusions",
        "description",
        "descr",
        "areas",
        "level",
        "cores",
        "geneds",
        "homepage",
        "website",
        # instructors
        "instructors",
        "instructor",
        "instr",
        "instrs",
        "instr_name",
        "instr_names",
        "instructor_name",
        # section
        "section",
        "sections",
        "section_code",
        "section_name",
        "meeting_section",
        "section_type",
        "type",
        "term",
        "semester",
        "year",
        "capacity",
        "size",
        "enrollment",
        "enrolment",
        "waitlist",
        "waitlist_size",
        "remaining_seats",
        "fees",
        "fee",
        "cost",
        "final_exam",
        # meeting
        "offerings",
        "meetings",
        "time_start",
        "start_time",
        "time_end",
        "end_time",
        "date_start",
        "date_end",
        "location",
        "loc",
        "where",
        "days",
        "day",
        "dates",
        "date",
        "time",
        "credits",
        "num_credits",
        "campus",  # TODO - not really
        # textbook / evaluation
        "textbooks",
        "isbn",
        "required",
        "detail_url",
        "image_url",
        "author",
        "title",
        "score",
        "summary",
        "same_as",
        "pos",
        "writing_intensive",
        "sub_school",
        "course_section_id",
    }

    # Aliases under which instructor information may be stored.
    _INSTRUCTOR_KEYS = frozenset((
        "instructors",
        "instructor",
        "instr",
        "instrs",
        "instr_name",
        "instr_names",
        "instructor_name",
    ))

    def __init__(
            self,
            config,
            output,
            break_on_error=True,
            break_on_warning=False,
            display_progress_bar=True,
            skip_duplicates=True,
            validate=True,
            tracker=None,
    ):
        """Construct ingestor object and resolve options.

        Args:
            config (dict): Configuration dictionary; the school code is read
                from ``config["school"]["code"]``.
            output (str, file): Output path or file object.
            break_on_error (bool, optional): Stop ingesting on error.
            break_on_warning (bool, optional): Stop ingesting on warning.
            display_progress_bar (bool, optional): display progress bar
                (accepted for interface compatibility; not read here).
            skip_duplicates (bool, optional): Skip ingesting courses
                that have already been seen.
            validate (bool, optional): Perform validation.
            tracker (library.tracker, optional): tracker object.  When
                omitted (or ``None``) a new ``NullTracker`` is created for
                this ingestor.
        """
        # A default of ``NullTracker()`` in the signature would be evaluated
        # once and shared (and mutated below) by every ingestor that omits
        # the argument, so the fallback is created per instance instead.
        if tracker is None:
            tracker = NullTracker()

        self.school = config["school"]["code"]
        self.validate = validate
        self.break_on_error = break_on_error
        self.break_on_warning = break_on_warning
        self.skip_duplicates = skip_duplicates
        self.tracker = tracker
        self.hoarder = Hoarder()
        self.tracker.add_viewer(self.hoarder)
        self.tracker.school = self.school

        # Open the top-level output object and the "$data" list that every
        # ingested object is written to.
        self.json = JSONStreamWriter(output, type_=dict).enter()
        self.data_list = self.json.write("$data", type_=list).enter()
        if self.validate:
            self.validator = Validator(config, tracker=self.tracker)

        # Inherit dictionary functionality.
        super(Ingestor, self).__init__()

    def _get(self, *keys, **kwargs):
        """Match the first key found in self dictionary.

        Note that this is purposefully not an override of the dict lookup.
        This allows the Ingestor to maintain dictionary-like
        functionality for the API user while internally checking itself.

        Args:
            *keys: The candidate keys, in order of preference.
            **kwargs: ``default`` - value returned when none of the keys is
                present (``None`` if not given).

        Returns:
            The value of the first key present in the Ingestor instance, or
            the default.

        Raises:
            IngestionWarning: If a key that is not in Ingestor.ALL_KEYS is
                reached before a present key is found.
        """
        default = kwargs.get("default")
        for key in keys:
            if key not in Ingestor.ALL_KEYS:
                raise IngestionWarning(key + " not in Ingestor.ALL_KEYS")
            if key in self:
                return self[key]
        return default

    def _resolve_department(self):
        """Return the department, preferring explicit name/code keys.

        The value stored under ``department`` is used as-is unless it is
        absent or any of the name/code alias keys is present, in which case
        a ``{"name", "code"}`` dict is built from those aliases.
        """
        part_keys = ("department_name", "department_code", "dept_name",
                     "dept_code")
        if "department" in self and not any(key in self for key in part_keys):
            return self._get("department")
        return {
            "name": titlize(self._get("department_name", "dept_name")),
            "code": self._get("department_code", "dept_code"),
        }

    def _resolve_instructors(self):
        """Return the instructors as a list of dicts, or None.

        Exactly one of the instructor alias keys may be present; string
        entries are wrapped as ``{"name": <string>}``.

        Raises:
            IngestionWarning: If more than one instructor alias is present.
        """
        present = Ingestor._INSTRUCTOR_KEYS & set(self)
        if not present:
            return None
        if len(present) > 1:
            raise IngestionWarning(
                "cannot resolve instructors from keys: {}".format(
                    # Sorted so that the message is deterministic.
                    ",".join(sorted(present))),
                self,
            )

        instructors = clean(make_list(self[next(iter(present))]))
        if instructors is not None:
            # Entries are replaced in place, as before.
            for index, instructor in enumerate(instructors):
                if isinstance(instructor, str):
                    instructors[index] = {"name": instructor}
        return instructors

    def _resolve_date(self):
        """Return the meeting dates.

        A value stored under ``dates`` (or ``date``) is used as-is when
        ``dates`` is present; otherwise a ``{"start", "end"}`` dict is built
        from ``date_start`` and ``date_end``.
        """
        if "dates" in self:
            # Previously the "date" key was read here, which dropped a
            # supplied "dates" value.
            return self._get("dates", "date")
        return {
            "start": short_date(self._get("date_start")),
            "end": short_date(self._get("date_end")),
        }

    def _resolve_time(self):
        """Return the meeting time.

        A value stored under ``time`` is used as-is; otherwise a
        ``{"start", "end"}`` dict is built from the start/end alias keys.
        """
        if "time" in self:
            return self._get("time")
        return {
            "start": time24(self._get("time_start", "start_time")),
            "end": time24(self._get("time_end", "end_time")),
        }

    def _resolve_location(self):
        """Return the meeting location.

        A string found under ``location``, ``loc`` or ``where`` is wrapped
        as ``{"where": <string>}``; otherwise the value stored under
        ``location`` (possibly None) is returned unchanged.
        """
        where = self._get("location", "loc", "where")
        if isinstance(where, str):
            return {"where": where}
        return self._get("location")

    def ingest_course(self):
        """Create course json from info in model map.

        The course is cleaned, validated and logged; the tracker's
        department is updated when the cleaned course has one.

        Returns:
            dict: course
        """
        course = {
            "kind": "course",
            "school": {
                "code": self.school,
                "subdivisions": [{
                    "code": self._get("school_subdivision_code"),
                    "name": self._get("school_subdivision_name"),
                }],
            },
            "code": self._get("course_code", "code", "course"),
            "name": titlize(self._get("name", "course_name")),
            "department": self._resolve_department(),
            "credits": safe_cast(self._get("credits", "num_credits"),
                                 float,
                                 default=0.0),
            "prerequisites": make_list(self._get("prerequisites", "prereqs")),
            "corequisites": make_list(self._get("corequisites", "coreqs")),
            "exclusions": make_list(self._get("exclusions")),
            "areas": make_list(self._get("areas")),
            "level": self._get("level"),
            "cores": make_list(self._get("cores")),
            "geneds": make_list(self._get("geneds")),
            "sections": self._get("sections"),
            "homepage": self._get("homepage", "website"),
            "same_as": make_list(self._get("same_as")),
            "description": self._get("description", "descr"),
            "pos": make_list(self._get("pos")),
            "writing_intensive": self._get("writing_intensive"),
            "sub_school": self._get("sub_school"),
        }

        course = clean(course)
        self._validate_and_log(course)
        if "department" in course:
            self.tracker.department = course["department"]
        return course

    def ingest_section(self, course):
        """Create section json object from info in model map.

        The section is cleaned, validated and logged, and the tracker's
        year and term are updated from it.

        Args:
            course (dict): validated course object

        Returns:
            dict: section
        """
        section = {
            "kind": "section",
            "course": {
                "code": course.get("code")
            },
            "code": self._get("section_code", "section", "meeting_section"),
            "name": titlize(self._get("section_name")),
            "term": self._get("term", "semester"),
            "year": str(self._get("year")),
            "instructors": self._resolve_instructors(),
            "capacity": safe_cast(self._get("capacity", "size"), int),
            "enrollment": safe_cast(self._get("enrollment", "enrolment"), int),
            "waitlist": safe_cast(self._get("waitlist"), int),
            "waitlist_size": safe_cast(self._get("waitlist_size"), int),
            "remaining_seats": safe_cast(self._get("remaining_seats"), int),
            "type": self._get("type", "section_type"),
            "fees": safe_cast(self._get("fees", "fee", "cost"), float),
            "final_exam": self._get("final_exam"),
            "textbooks": self._get("textbooks"),
            "meetings": self._get("offerings", "meetings"),
            "course_section_id": safe_cast(self._get("course_section_id"),
                                           int),
        }

        section = clean(section)
        self._validate_and_log(section)
        self.tracker.year = section["year"]
        self.tracker.term = section["term"]
        return section

    def ingest_meeting(self, section, clean_only=False):
        """Create meeting ingested json map.

        Args:
            section (dict): validated section object
            clean_only (bool, optional): When true, return the cleaned
                meeting without validating or logging it and without
                updating the tracker.

        Returns:
            dict: meeting
        """
        section_code = section.get("code")
        if section_code is None:
            # Without a section code the year/term are left out as well.
            year = None
            term = None
        else:
            year = str(self._get("year"))
            term = self._get("term", "semester")

        meeting = {
            "kind": "meeting",
            "course": section.get("course"),
            "section": {
                "code": section_code,
                "year": year,
                "term": term,
            },
            "days": make_list(self._get("days", "day")),
            "dates": self._resolve_date(),
            "time": self._resolve_time(),
            "location": self._resolve_location(),
        }

        meeting = clean(meeting)
        if clean_only:
            return meeting

        self._validate_and_log(meeting)
        if "time" in meeting:
            self.tracker.time = meeting["time"]["start"]
            self.tracker.time = meeting["time"]["end"]
        return meeting

    def ingest_textbook_link(self, section=None):
        """Create textbook link json object.

        The link is cleaned, validated and logged; the tracker's year and
        term (and department, when stored on the ingestor) are updated.

        Args:
            section (None, :obj:`dict`, optional): Not read by this method.

        Returns:
            dict: textbook link.
        """
        textbook_link = {
            "kind": "textbook_link",
            "school": {
                "code": self._get("school", "school_code")
            },
            "course": {
                "code": self._get("course_code")
            },
            "section": {
                "code": self._get("section_code"),
                "year": str(self._get("year")),
                "term": self._get("term", "semester"),
            },
            "isbn": self._get("isbn"),
            "required": self._get("required"),
        }

        textbook_link = clean(textbook_link)
        self._validate_and_log(textbook_link)
        self.tracker.year = textbook_link["section"]["year"]
        self.tracker.term = textbook_link["section"]["term"]
        if "department" in self:
            self.tracker.department = self["department"]
        return textbook_link

    def ingest_textbook(self):
        """Create textbook json object.

        Returns:
            dict: textbook
        """
        textbook = {
            "kind": "textbook",
            "isbn": self._get("isbn"),
            "detail_url": self._get("detail_url"),
            "image_url": self._get("image_url"),
            "author": self._get("author"),
            "title": self._get("title"),
        }

        textbook = clean(textbook)
        self._validate_and_log(textbook)
        if "department" in self:
            self.tracker.department = self["department"]
        return textbook

    def ingest_eval(self):
        """Create evaluation json object.

        Returns:
            dict: eval

        Raises:
            TypeError, ValueError: If the stored ``score`` cannot be
                converted with ``float``.
        """
        evaluation = {
            "kind": "eval",
            "year": str(self._get("year")),
            "term": self._get("term"),
            "score": float(self._get("score")),
            "instructors": self._resolve_instructors(),
            "course": {
                "code": self._get("course_code")
            },
            "summary": self._get("summary"),
        }

        evaluation = clean(evaluation)
        self._validate_and_log(evaluation)
        self.tracker.year = evaluation["year"]
        self.tracker.term = evaluation["term"]
        return evaluation

    def end(self):
        """Finish ingesting.

        Close i/o, clear internal state, write meta info
        """
        self.data_list.exit()
        self.json.write(
            "$meta",
            {
                "$schools": self.hoarder.schools,
                "$timestamp": self.tracker.start_time
            },
        )
        self.json.exit()
        self.clear()

    def _validate_and_log(self, obj):
        """Validate ``obj``, write it to the output and update statistics.

        With validation disabled the object is written unconditionally.
        Otherwise it is written only when the validator accepted it (or
        merely warned), and nothing further happens when the validator asks
        for the object to be skipped.  Afterwards the ingestor's own keys are
        checked against ``ALL_KEYS``.

        Args:
            obj (dict): cleaned object with a ``kind`` entry.

        Raises:
            IngestionWarning: If an unsupported key is stored on the
                ingestor and ``break_on_warning`` is set.
        """
        if self.validate is False:
            self.data_list.write(obj)
            self.tracker.stats = dict(kind=obj["kind"], status="total")
            return

        is_valid, skip = self._run_validator(obj)
        if skip:
            return
        if is_valid:
            self.data_list.write(obj)

        try:
            # Raised (and caught right below) so that logging.exception has
            # an active exception to report.
            for key in self:
                if key not in Ingestor.ALL_KEYS:
                    raise IngestionWarning(
                        self, "ingestor does not support key {}: {}".format(
                            key, self[key]))
        except IngestionWarning:
            logging.exception("Ingestor warning")
            if self.break_on_warning:
                raise
        self.tracker.stats = dict(kind=obj["kind"], status="total")

    def _run_validator(self, data):
        """Run the validator on ``data`` and classify the outcome.

        Args:
            data (dict): cleaned object with a ``kind`` entry.

        Returns:
            tuple: ``(is_valid, full_skip)``.  ``is_valid`` is true when
            validation passed or only produced a tolerated warning;
            ``full_skip`` is true for a duplicate definition while
            ``skip_duplicates`` is set.

        Raises:
            ValidationError: On a validation error when ``break_on_error``
                is set.
            ValidationWarning: On a tolerated-kind warning when
                ``break_on_warning`` is set.
        """
        is_valid = False
        full_skip = False

        logger = logging.getLogger("parsing.schools." + self.school)

        try:
            self.validator.validate(data)
            self.tracker.stats = dict(kind=data["kind"], status="valid")
            is_valid = True
        except ValidationError as e:
            if self.break_on_error:
                raise ValidationError(*e.args)
            logger.warning("Ingestion failed", exc_info=True)
            # A placeholder is required for the dump to appear in the log.
            logger.debug("Ingestor dump: %s", dict(self))
        except ValidationWarning as e:
            if (isinstance(e, MultipleDefinitionsWarning)
                    and self.skip_duplicates):
                full_skip = True
            else:
                is_valid = True
                if self.break_on_warning:
                    raise ValidationWarning(*e.args)
                logger.warning("Validation warning", exc_info=True)
                logger.debug("Ingestor dump: %s", dict(self))

        return is_valid, full_skip